osfmk/arm/locks_arm.c

   1 /*
   2  * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System Copyright (c) 1991,1990,1989,1988,1987 Carnegie
  33  * Mellon University All Rights Reserved.
  34  *
  35  * Permission to use, copy, modify and distribute this software and its
  36  * documentation is hereby granted, provided that both the copyright notice
  37  * and this permission notice appear in all copies of the software,
  38  * derivative works or modified versions, and any portions thereof, and that
  39  * both notices appear in supporting documentation.
  40  *
  41  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
  42  * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
  43  * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  44  *
  45  * Carnegie Mellon requests users of this software to return to
  46  *
  47  * Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  48  * School of Computer Science Carnegie Mellon University Pittsburgh PA
  49  * 15213-3890
  50  *
  51  * any improvements or extensions that they make and grant Carnegie Mellon the
  52  * rights to redistribute these changes.
  53  */
  54 /*
  55  *      File:   kern/lock.c
  56  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  57  *      Date:   1985
  58  *
  59  *      Locking primitives implementation
  60  */
  61
  62 #define LOCK_PRIVATE 1
  63
  64 #include <mach_ldebug.h>
  65
  66 #include <kern/kalloc.h>
  67 #include <kern/lock_stat.h>
  68 #include <kern/locks.h>
  69 #include <kern/misc_protos.h>
  70 #include <kern/thread.h>
  71 #include <kern/processor.h>
  72 #include <kern/sched_prim.h>
  73 #include <kern/debug.h>
  74 #include <kern/kcdata.h>
  75 #include <string.h>
  76 #include <arm/cpu_internal.h>
  77 #include <os/hash.h>
  78 #include <arm/cpu_data.h>
  79
  80 #include <arm/cpu_data_internal.h>
  81 #include <arm/proc_reg.h>
  82 #include <arm/smp.h>
  83 #include <machine/atomic.h>
  84 #include <machine/machine_cpu.h>
  85
  86 #include <sys/kdebug.h>
  87
#if CONFIG_DTRACE
/*
 * Flag values that only exist on DTrace-enabled builds.
 * NOTE(review): presumably handed to the lockstat probes to tell reader
 * and writer acquisitions apart -- confirm at the use sites.
 */
#define DTRACE_RW_SHARED        0x0     //reader
#define DTRACE_RW_EXCL          0x1     //writer
#define DTRACE_NO_FLAG          0x0     //not applicable
#endif  /* CONFIG_DTRACE */

/*
 * Per-operation codes for the rw-lock paths (exclusive, shared,
 * shared-to-exclusive upgrade, exclusive-to-shared downgrade).
 * NOTE(review): presumably used to build kdebug trace event ids
 * (sys/kdebug.h is included above) -- confirm at the use sites.
 */
#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105
 100
 101
/* Non-zero when any of the lock debugging build options is enabled. */
#define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

// Gate for panics raised by checks of lock usage correctness.
// Such panics are undesirable while already panicking or while a debugger
// is running, so the gate is only true when kernel_debugger_entry_count is 0.
#define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)

/*
 * Lock option flags, zero by default.
 * NOTE(review): nothing visible here reads or writes this -- presumably
 * configured during startup; confirm against the users.
 */
unsigned int    LcksOpts = 0;

#define ADAPTIVE_SPIN_ENABLE 0x1

/* Mutex adaptive spinning defaults to enabled on SMP builds and disabled otherwise. */
#if __SMP__
int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE;
#else /* __SMP__ */
int lck_mtx_adaptive_spin_mode = 0;
#endif /* __SMP__ */
 117
/*
 * NOTE(review): presumably the number of spin iterations between checks of
 * the lock owner; the code using it is not visible here -- confirm.
 */
#define SPINWAIT_OWNER_CHECK_COUNT 4

/*
 * Outcome of a spin-wait attempt on a lock.
 *
 * The four SPINWAIT_DID_SPIN_* values all describe the same lock state
 * (interlock held, spun, lock not acquired).
 * NOTE(review): they presumably differ only in why spinning stopped, as
 * their names suggest -- confirm against the code that returns them.
 */
typedef enum {
        SPINWAIT_ACQUIRED,     /* Got the lock. */
        SPINWAIT_INTERLOCK,    /* Got the interlock, no owner, but caller must finish acquiring the lock. */
        SPINWAIT_DID_SPIN_HIGH_THR, /* Got the interlock, spun, but failed to get the lock. */
        SPINWAIT_DID_SPIN_OWNER_NOT_CORE, /* Got the interlock, spun, but failed to get the lock. */
        SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION, /* Got the interlock, spun, but failed to get the lock. */
        SPINWAIT_DID_SPIN_SLIDING_THR,/* Got the interlock, spun, but failed to get the lock. */
        SPINWAIT_DID_NOT_SPIN, /* Got the interlock, did not spin. */
} spinwait_result_t;
 129
#if CONFIG_DTRACE && __SMP__
/*
 * Defined elsewhere.
 * NOTE(review): presumably the spin duration above which DTrace spin
 * probes are fired -- confirm at the use sites.
 */
extern uint64_t dtrace_spin_threshold;
#endif

/* Forwards */

/*
 * Defined elsewhere. Non-zero when not running in the kernel debugger:
 * kdp_lck_spin_is_acquired() panics if it is set.
 */
extern unsigned int not_in_kdp;

/*
 *      We often want to know the addresses of the callers
 *      of the various lock routines.  However, this information
 *      is only used for debugging and statistics.
 */
typedef void   *pc_t;
/* Sentinel values marking "no caller pc" and "no thread". */
#define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)

#ifdef  lint
/*
 *      Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc, l) ++pc
#else                           /* lint */
/* Outside lint builds OBTAIN_PC expands to nothing. */
#define OBTAIN_PC(pc, l)
#endif                          /* lint */
 155
 156
/*
 *      Portable lock package implementation of usimple_locks.
 */

/*
 * Owner thread pointer when lock held in spin mode
 */
#define LCK_MTX_SPIN_TAG  0xfffffff0


/*
 * Mutex interlock: a bit lock on LCK_ILOCK_BIT of the lck_mtx_data word.
 * No lock group is attributed (LCK_GRP_NULL).
 */
#define interlock_lock(lock)    hw_lock_bit    ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
#define interlock_try(lock)             hw_lock_bit_try((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
#define interlock_unlock(lock)  hw_unlock_bit  ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
/* RW-lock interlock: a bit lock on LCK_RW_INTERLOCK_BIT of the lck_rw_tag word. */
#define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
#define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)

/* Acquire fence. */
#define load_memory_barrier()   os_atomic_thread_fence(acquire)

// Enforce program order of loads and stores.
// Both helpers use the compiler_acq_rel ordering of the os_atomic API.
#define ordered_load(target) \
                os_atomic_load(target, compiler_acq_rel)
#define ordered_store(target, value) \
                os_atomic_store(target, value, compiler_acq_rel)

/* Ordered accessors for the state word of each lock flavor. */
#define ordered_load_mtx(lock)                  ordered_load(&(lock)->lck_mtx_data)
#define ordered_store_mtx(lock, value)  ordered_store(&(lock)->lck_mtx_data, (value))
#define ordered_load_rw(lock)                   ordered_load(&(lock)->lck_rw_data)
#define ordered_store_rw(lock, value)   ordered_store(&(lock)->lck_rw_data, (value))
#define ordered_load_rw_owner(lock)             ordered_load(&(lock)->lck_rw_owner)
#define ordered_store_rw_owner(lock, value)     ordered_store(&(lock)->lck_rw_owner, (value))
#define ordered_load_hw(lock)                   ordered_load(&(lock)->lock_data)
#define ordered_store_hw(lock, value)   ordered_store(&(lock)->lock_data, (value))
#define ordered_load_bit(lock)                  ordered_load((lock))
#define ordered_store_bit(lock, value)  ordered_store((lock), (value))


// Prevent the compiler from reordering memory operations around this
#define compiler_memory_fence() __asm__ volatile ("" ::: "memory")

/*
 * NOTE(review): presumably the spin budget after which a stuck lock
 * acquisition panics; the user is not visible here -- confirm.
 */
#define LOCK_PANIC_TIMEOUT      0xc00000
/* Keep a function out of line. */
#define NOINLINE                __attribute__((noinline))
 198
 199
 200 #if __arm__
 201 #define interrupts_disabled(mask) (mask & PSR_INTMASK)
 202 #else
 203 #define interrupts_disabled(mask) (mask & DAIF_IRQF)
 204 #endif
 205
 206
 207 #if __arm__
 208 #define enable_fiq()            __asm__ volatile ("cpsie  f" ::: "memory");
 209 #define enable_interrupts()     __asm__ volatile ("cpsie if" ::: "memory");
 210 #endif
 211
/*
 * Forward declarations
 *
 * File-local rw-lock helpers, declared here so they can be called before
 * their definitions.
 */

static void lck_rw_lock_shared_gen(lck_rw_t *lck);
static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
/* mode is LCK_RW_GRAB_WANT or LCK_RW_GRAB_SHARED. */
static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
 223
 224 /*
 225  * atomic exchange API is a low level abstraction of the operations
 226  * to atomically read, modify, and write a pointer.  This abstraction works
 227  * for both Intel and ARMv8.1 compare and exchange atomic instructions as
 228  * well as the ARM exclusive instructions.
 229  *
 230  * atomic_exchange_begin() - begin exchange and retrieve current value
 231  * atomic_exchange_complete() - conclude an exchange
 232  * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
 233  */
 234 __unused static uint32_t
 235 load_exclusive32(uint32_t *target, enum memory_order ord)
 236 {
 237         uint32_t        value;
 238
 239 #if __arm__
 240         if (memory_order_has_release(ord)) {
 241                 // Pre-load release barrier
 242                 atomic_thread_fence(memory_order_release);
 243         }
 244         value = __builtin_arm_ldrex(target);
 245 #else
 246         if (memory_order_has_acquire(ord)) {
 247                 value = __builtin_arm_ldaex(target);    // ldaxr
 248         } else {
 249                 value = __builtin_arm_ldrex(target);    // ldxr
 250         }
 251 #endif  // __arm__
 252         return value;
 253 }
 254
 255 __unused static boolean_t
 256 store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
 257 {
 258         boolean_t err;
 259
 260 #if __arm__
 261         err = __builtin_arm_strex(value, target);
 262         if (memory_order_has_acquire(ord)) {
 263                 // Post-store acquire barrier
 264                 atomic_thread_fence(memory_order_acquire);
 265         }
 266 #else
 267         if (memory_order_has_release(ord)) {
 268                 err = __builtin_arm_stlex(value, target);       // stlxr
 269         } else {
 270                 err = __builtin_arm_strex(value, target);       // stxr
 271         }
 272 #endif  // __arm__
 273         return !err;
 274 }
 275
 276 static uint32_t
 277 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
 278 {
 279         uint32_t        val;
 280
 281 #if __ARM_ATOMICS_8_1
 282         ord = memory_order_relaxed;
 283 #endif
 284         val = load_exclusive32(target, ord);
 285         *previous = val;
 286         return val;
 287 }
 288
 289 static boolean_t
 290 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
 291 {
 292 #if __ARM_ATOMICS_8_1
 293         return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
 294 #else
 295         (void)previous;         // Previous not needed, monitor is held
 296         return store_exclusive32(target, newval, ord);
 297 #endif
 298 }
 299
 300 static void
 301 atomic_exchange_abort(void)
 302 {
 303         os_atomic_clear_exclusive();
 304 }
 305
 306 static boolean_t
 307 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
 308 {
 309         uint32_t                value, prev;
 310
 311         for (;;) {
 312                 value = atomic_exchange_begin32(target, &prev, ord);
 313                 if (value & test_mask) {
 314                         if (wait) {
 315                                 wait_for_event();       // Wait with monitor held
 316                         } else {
 317                                 atomic_exchange_abort();        // Clear exclusive monitor
 318                         }
 319                         return FALSE;
 320                 }
 321                 value |= set_mask;
 322                 if (atomic_exchange_complete32(target, prev, value, ord)) {
 323                         return TRUE;
 324                 }
 325         }
 326 }
 327
 328 inline boolean_t
 329 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
 330 {
 331         return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
 332 }
 333
 334 void
 335 _disable_preemption(void)
 336 {
 337         thread_t     thread = current_thread();
 338         unsigned int count  = thread->machine.preemption_count;
 339
 340         count += 1;
 341         if (__improbable(count == 0)) {
 342                 panic("Preemption count overflow");
 343         }
 344
 345         os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
 346 }
 347
 348 /*
 349  * This function checks whether an AST_URGENT has been pended.
 350  *
 351  * It is called once the preemption has been reenabled, which means the thread
 352  * may have been preempted right before this was called, and when this function
 353  * actually performs the check, we've changed CPU.
 354  *
 355  * This race is however benign: the point of AST_URGENT is to trigger a context
 356  * switch, so if one happened, there's nothing left to check for, and AST_URGENT
 357  * was cleared in the process.
 358  *
 359  * It follows that this check cannot have false negatives, which allows us
 360  * to avoid fiddling with interrupt state for the vast majority of cases
 361  * when the check will actually be negative.
 362  */
 363 static NOINLINE void
 364 kernel_preempt_check(thread_t thread)
 365 {
 366         cpu_data_t *cpu_data_ptr;
 367         long        state;
 368
 369 #if __arm__
 370 #define INTERRUPT_MASK PSR_IRQF
 371 #else   // __arm__
 372 #define INTERRUPT_MASK DAIF_IRQF
 373 #endif  // __arm__
 374
 375         /*
 376          * This check is racy and could load from another CPU's pending_ast mask,
 377          * but as described above, this can't have false negatives.
 378          */
 379         cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
 380         if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) {
 381                 return;
 382         }
 383
 384         /* If interrupts are masked, we can't take an AST here */
 385         state = get_interrupts();
 386         if ((state & INTERRUPT_MASK) == 0) {
 387                 disable_interrupts_noread();                    // Disable interrupts
 388
 389                 /*
 390                  * Reload cpu_data_ptr: a context switch would cause it to change.
 391                  * Now that interrupts are disabled, this will debounce false positives.
 392                  */
 393                 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
 394                 if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
 395 #if __arm__
 396 #if __ARM_USER_PROTECT__
 397                         uintptr_t up = arm_user_protect_begin(thread);
 398 #endif  // __ARM_USER_PROTECT__
 399                         enable_fiq();
 400 #endif  // __arm__
 401                         ast_taken_kernel();                 // Handle urgent AST
 402 #if __arm__
 403 #if __ARM_USER_PROTECT__
 404                         arm_user_protect_end(thread, up, TRUE);
 405 #endif  // __ARM_USER_PROTECT__
 406                         enable_interrupts();
 407                         return;                             // Return early on arm only due to FIQ enabling
 408 #endif  // __arm__
 409                 }
 410                 restore_interrupts(state);              // Enable interrupts
 411         }
 412 }
 413
 414 void
 415 _enable_preemption(void)
 416 {
 417         thread_t     thread = current_thread();
 418         unsigned int count  = thread->machine.preemption_count;
 419
 420         if (__improbable(count == 0)) {
 421                 panic("Preemption count underflow");
 422         }
 423         count -= 1;
 424
 425         os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
 426         if (count == 0) {
 427                 kernel_preempt_check(thread);
 428         }
 429 }
 430
 431 int
 432 get_preemption_level(void)
 433 {
 434         return current_thread()->machine.preemption_count;
 435 }
 436
 437 /*
 438  *      Routine:        lck_spin_alloc_init
 439  */
 440 lck_spin_t     *
 441 lck_spin_alloc_init(
 442         lck_grp_t * grp,
 443         lck_attr_t * attr)
 444 {
 445         lck_spin_t     *lck;
 446
 447         if ((lck = (lck_spin_t *) kalloc(sizeof(lck_spin_t))) != 0) {
 448                 lck_spin_init(lck, grp, attr);
 449         }
 450
 451         return lck;
 452 }
 453
 454 /*
 455  *      Routine:        lck_spin_free
 456  */
 457 void
 458 lck_spin_free(
 459         lck_spin_t * lck,
 460         lck_grp_t * grp)
 461 {
 462         lck_spin_destroy(lck, grp);
 463         kfree(lck, sizeof(lck_spin_t));
 464 }
 465
 466 /*
 467  *      Routine:        lck_spin_init
 468  */
 469 void
 470 lck_spin_init(
 471         lck_spin_t * lck,
 472         lck_grp_t * grp,
 473         __unused lck_attr_t * attr)
 474 {
 475         lck->type = LCK_SPIN_TYPE;
 476         hw_lock_init(&lck->hwlock);
 477         if (grp) {
 478                 lck_grp_reference(grp);
 479                 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
 480         }
 481 }
 482
 483 /*
 484  * arm_usimple_lock is a lck_spin_t without a group or attributes
 485  */
 486 void inline
 487 arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
 488 {
 489         lck->type = LCK_SPIN_TYPE;
 490         hw_lock_init(&lck->hwlock);
 491 }
 492
 493
 494 /*
 495  *      Routine:        lck_spin_lock
 496  */
 497 void
 498 lck_spin_lock(lck_spin_t *lock)
 499 {
 500 #if     DEVELOPMENT || DEBUG
 501         if (lock->type != LCK_SPIN_TYPE) {
 502                 panic("Invalid spinlock %p", lock);
 503         }
 504 #endif  // DEVELOPMENT || DEBUG
 505         hw_lock_lock(&lock->hwlock, LCK_GRP_NULL);
 506 }
 507
 508 void
 509 lck_spin_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
 510 {
 511 #pragma unused(grp)
 512 #if     DEVELOPMENT || DEBUG
 513         if (lock->type != LCK_SPIN_TYPE) {
 514                 panic("Invalid spinlock %p", lock);
 515         }
 516 #endif  // DEVELOPMENT || DEBUG
 517         hw_lock_lock(&lock->hwlock, grp);
 518 }
 519
 520 /*
 521  *      Routine:        lck_spin_lock_nopreempt
 522  */
 523 void
 524 lck_spin_lock_nopreempt(lck_spin_t *lock)
 525 {
 526 #if     DEVELOPMENT || DEBUG
 527         if (lock->type != LCK_SPIN_TYPE) {
 528                 panic("Invalid spinlock %p", lock);
 529         }
 530 #endif  // DEVELOPMENT || DEBUG
 531         hw_lock_lock_nopreempt(&lock->hwlock, LCK_GRP_NULL);
 532 }
 533
 534 void
 535 lck_spin_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
 536 {
 537 #pragma unused(grp)
 538 #if     DEVELOPMENT || DEBUG
 539         if (lock->type != LCK_SPIN_TYPE) {
 540                 panic("Invalid spinlock %p", lock);
 541         }
 542 #endif  // DEVELOPMENT || DEBUG
 543         hw_lock_lock_nopreempt(&lock->hwlock, grp);
 544 }
 545
 546 /*
 547  *      Routine:        lck_spin_try_lock
 548  */
 549 int
 550 lck_spin_try_lock(lck_spin_t *lock)
 551 {
 552         return hw_lock_try(&lock->hwlock, LCK_GRP_NULL);
 553 }
 554
 555 int
 556 lck_spin_try_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
 557 {
 558 #pragma unused(grp)
 559         return hw_lock_try(&lock->hwlock, grp);
 560 }
 561
 562 /*
 563  *      Routine:        lck_spin_try_lock_nopreempt
 564  */
 565 int
 566 lck_spin_try_lock_nopreempt(lck_spin_t *lock)
 567 {
 568         return hw_lock_try_nopreempt(&lock->hwlock, LCK_GRP_NULL);
 569 }
 570
 571 int
 572 lck_spin_try_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
 573 {
 574 #pragma unused(grp)
 575         return hw_lock_try_nopreempt(&lock->hwlock, grp);
 576 }
 577
 578 /*
 579  *      Routine:        lck_spin_unlock
 580  */
 581 void
 582 lck_spin_unlock(lck_spin_t *lock)
 583 {
 584 #if     DEVELOPMENT || DEBUG
 585         if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
 586                 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
 587         }
 588         if (lock->type != LCK_SPIN_TYPE) {
 589                 panic("Invalid spinlock type %p", lock);
 590         }
 591 #endif  // DEVELOPMENT || DEBUG
 592         hw_lock_unlock(&lock->hwlock);
 593 }
 594
 595 /*
 596  *      Routine:        lck_spin_unlock_nopreempt
 597  */
 598 void
 599 lck_spin_unlock_nopreempt(lck_spin_t *lock)
 600 {
 601 #if     DEVELOPMENT || DEBUG
 602         if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
 603                 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
 604         }
 605         if (lock->type != LCK_SPIN_TYPE) {
 606                 panic("Invalid spinlock type %p", lock);
 607         }
 608 #endif  // DEVELOPMENT || DEBUG
 609         hw_lock_unlock_nopreempt(&lock->hwlock);
 610 }
 611
 612 /*
 613  *      Routine:        lck_spin_destroy
 614  */
 615 void
 616 lck_spin_destroy(
 617         lck_spin_t * lck,
 618         lck_grp_t * grp)
 619 {
 620         if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED) {
 621                 return;
 622         }
 623         lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
 624         if (grp) {
 625                 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
 626                 lck_grp_deallocate(grp);
 627         }
 628 }
 629
 630 /*
 631  * Routine: kdp_lck_spin_is_acquired
 632  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 633  */
 634 boolean_t
 635 kdp_lck_spin_is_acquired(lck_spin_t *lck)
 636 {
 637         if (not_in_kdp) {
 638                 panic("panic: spinlock acquired check done outside of kernel debugger");
 639         }
 640         return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE:FALSE;
 641 }
 642
 643 /*
 644  *      Initialize a usimple_lock.
 645  *
 646  *      No change in preemption state.
 647  */
 648 void
 649 usimple_lock_init(
 650         usimple_lock_t l,
 651         unsigned short tag)
 652 {
 653         simple_lock_init((simple_lock_t) l, tag);
 654 }
 655
 656
 657 /*
 658  *      Acquire a usimple_lock.
 659  *
 660  *      Returns with preemption disabled.  Note
 661  *      that the hw_lock routines are responsible for
 662  *      maintaining preemption state.
 663  */
 664 void
 665 (usimple_lock)(
 666         usimple_lock_t l
 667         LCK_GRP_ARG(lck_grp_t *grp))
 668 {
 669         simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
 670 }
 671
 672
 673 extern void     sync(void);
 674
 675 /*
 676  *      Release a usimple_lock.
 677  *
 678  *      Returns with preemption enabled.  Note
 679  *      that the hw_lock routines are responsible for
 680  *      maintaining preemption state.
 681  */
 682 void
 683 (usimple_unlock)(
 684         usimple_lock_t l)
 685 {
 686         simple_unlock((simple_lock_t)l);
 687 }
 688
 689
 690 /*
 691  *      Conditionally acquire a usimple_lock.
 692  *
 693  *      On success, returns with preemption disabled.
 694  *      On failure, returns with preemption in the same state
 695  *      as when first invoked.  Note that the hw_lock routines
 696  *      are responsible for maintaining preemption state.
 697  *
 698  *      XXX No stats are gathered on a miss; I preserved this
 699  *      behavior from the original assembly-language code, but
 700  *      doesn't it make sense to log misses?  XXX
 701  */
 702 unsigned
 703 int
 704 (usimple_lock_try)(
 705         usimple_lock_t l
 706         LCK_GRP_ARG(lck_grp_t *grp))
 707 {
 708         return simple_lock_try((simple_lock_t) l, grp);
 709 }
 710
 711 /*
 712  * The C portion of the shared/exclusive locks package.
 713  */
 714
 715 /*
 716  * compute the deadline to spin against when
 717  * waiting for a change of state on a lck_rw_t
 718  */
 719 #if     __SMP__
 720 static inline uint64_t
 721 lck_rw_deadline_for_spin(lck_rw_t *lck)
 722 {
 723         lck_rw_word_t   word;
 724
 725         word.data = ordered_load_rw(lck);
 726         if (word.can_sleep) {
 727                 if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
 728                         /*
 729                          * there are already threads waiting on this lock... this
 730                          * implies that they have spun beyond their deadlines waiting for
 731                          * the desired state to show up so we will not bother spinning at this time...
 732                          *   or
 733                          * the current number of threads sharing this lock exceeds our capacity to run them
 734                          * concurrently and since all states we're going to spin for require the rw_shared_count
 735                          * to be at 0, we'll not bother spinning since the latency for this to happen is
 736                          * unpredictable...
 737                          */
 738                         return mach_absolute_time();
 739                 }
 740                 return mach_absolute_time() + MutexSpin;
 741         } else {
 742                 return mach_absolute_time() + (100000LL * 1000000000LL);
 743         }
 744 }
 745 #endif  // __SMP__
 746
 747 static boolean_t
 748 lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unused)
 749 {
 750 #if     __SMP__
 751         uint64_t        deadline = 0;
 752         uint32_t        data;
 753
 754         if (wait) {
 755                 deadline = lck_rw_deadline_for_spin(lock);
 756         }
 757
 758         for (;;) {
 759                 data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
 760                 if ((data & status_mask) == 0) {
 761                         break;
 762                 }
 763                 if (wait) {
 764                         wait_for_event();
 765                 } else {
 766                         os_atomic_clear_exclusive();
 767                 }
 768                 if (!wait || (mach_absolute_time() >= deadline)) {
 769                         return FALSE;
 770                 }
 771         }
 772         os_atomic_clear_exclusive();
 773         return TRUE;
 774 #else
 775         uint32_t        data;
 776
 777         data = ordered_load_rw(lock);
 778         if ((data & status_mask) == 0) {
 779                 return TRUE;
 780         } else {
 781                 return FALSE;
 782         }
 783 #endif  // __SMP__
 784 }
 785
 786 /*
 787  * Spin while interlock is held.
 788  */
 789 static inline void
 790 lck_rw_interlock_spin(lck_rw_t *lock)
 791 {
 792 #if __SMP__
 793         uint32_t        data;
 794
 795         for (;;) {
 796                 data = load_exclusive32(&lock->lck_rw_data, memory_order_relaxed);
 797                 if (data & LCK_RW_INTERLOCK) {
 798                         wait_for_event();
 799                 } else {
 800                         os_atomic_clear_exclusive();
 801                         return;
 802                 }
 803         }
 804 #else
 805         panic("lck_rw_interlock_spin(): Interlock locked %p %x", lock, lock->lck_rw_data);
 806 #endif
 807 }
 808
 809 /*
 810  * We disable interrupts while holding the RW interlock to prevent an
 811  * interrupt from exacerbating hold time.
 812  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 813  */
 814 static inline boolean_t
 815 lck_interlock_lock(lck_rw_t *lck)
 816 {
 817         boolean_t       istate;
 818
 819         istate = ml_set_interrupts_enabled(FALSE);
 820         lck_rw_ilk_lock(lck);
 821         return istate;
 822 }
 823
 824 static inline void
 825 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
 826 {
 827         lck_rw_ilk_unlock(lck);
 828         ml_set_interrupts_enabled(istate);
 829 }
 830
 831
 832 #define LCK_RW_GRAB_WANT        0
 833 #define LCK_RW_GRAB_SHARED      1
 834
 835 static boolean_t
 836 lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait)
 837 {
 838         uint64_t        deadline = 0;
 839         uint32_t        data, prev;
 840         boolean_t       do_exch;
 841
 842 #if __SMP__
 843         if (wait) {
 844                 deadline = lck_rw_deadline_for_spin(lock);
 845         }
 846 #else
 847         wait = FALSE;   // Don't spin on UP systems
 848 #endif
 849
 850         for (;;) {
 851                 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
 852                 if (data & LCK_RW_INTERLOCK) {
 853                         atomic_exchange_abort();
 854                         lck_rw_interlock_spin(lock);
 855                         continue;
 856                 }
 857                 do_exch = FALSE;
 858                 if (mode == LCK_RW_GRAB_WANT) {
 859                         if ((data & LCK_RW_WANT_EXCL) == 0) {
 860                                 data |= LCK_RW_WANT_EXCL;
 861                                 do_exch = TRUE;
 862                         }
 863                 } else {        // LCK_RW_GRAB_SHARED
 864                         if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
 865                             (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
 866                                 data += LCK_RW_SHARED_READER;
 867                                 do_exch = TRUE;
 868                         }
 869                 }
 870                 if (do_exch) {
 871                         if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
 872                                 return TRUE;
 873                         }
 874                 } else {
 875                         if (wait) {                                             // Non-waiting
 876                                 wait_for_event();
 877                         } else {
 878                                 atomic_exchange_abort();
 879                         }
 880                         if (!wait || (mach_absolute_time() >= deadline)) {
 881                                 return FALSE;
 882                         }
 883                 }
 884         }
 885 }
 886
 887
 888 /*
 889  *      Routine:        lck_rw_alloc_init
 890  */
 891 lck_rw_t *
 892 lck_rw_alloc_init(
 893         lck_grp_t       *grp,
 894         lck_attr_t      *attr)
 895 {
 896         lck_rw_t        *lck;
 897
 898         if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
 899                 lck_rw_init(lck, grp, attr);
 900         }
 901
 902         return lck;
 903 }
 904
 905 /*
 906  *      Routine:        lck_rw_free
 907  */
 908 void
 909 lck_rw_free(
 910         lck_rw_t        *lck,
 911         lck_grp_t       *grp)
 912 {
 913         lck_rw_destroy(lck, grp);
 914         kfree(lck, sizeof(lck_rw_t));
 915 }
 916
 917 /*
 918  *      Routine:        lck_rw_init
 919  */
 920 void
 921 lck_rw_init(
 922         lck_rw_t        *lck,
 923         lck_grp_t       *grp,
 924         lck_attr_t      *attr)
 925 {
 926         if (attr == LCK_ATTR_NULL) {
 927                 attr = &LockDefaultLckAttr;
 928         }
 929         memset(lck, 0, sizeof(lck_rw_t));
 930         lck->lck_rw_can_sleep = TRUE;
 931         if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0) {
 932                 lck->lck_rw_priv_excl = TRUE;
 933         }
 934
 935         lck_grp_reference(grp);
 936         lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
 937 }
 938
 939
 940 /*
 941  *      Routine:        lck_rw_destroy
 942  */
 943 void
 944 lck_rw_destroy(
 945         lck_rw_t        *lck,
 946         lck_grp_t       *grp)
 947 {
 948         if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
 949                 return;
 950         }
 951 #if MACH_LDEBUG
 952         lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
 953 #endif
 954         lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
 955         lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
 956         lck_grp_deallocate(grp);
 957         return;
 958 }
 959
 960 /*
 961  *      Routine:        lck_rw_lock
 962  */
 963 void
 964 lck_rw_lock(
 965         lck_rw_t                *lck,
 966         lck_rw_type_t   lck_rw_type)
 967 {
 968         if (lck_rw_type == LCK_RW_TYPE_SHARED) {
 969                 lck_rw_lock_shared(lck);
 970         } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
 971                 lck_rw_lock_exclusive(lck);
 972         } else {
 973                 panic("lck_rw_lock(): Invalid RW lock type: %x", lck_rw_type);
 974         }
 975 }
 976
 977 /*
 978  *      Routine:        lck_rw_lock_exclusive
 979  */
 980 void
 981 lck_rw_lock_exclusive(lck_rw_t *lock)
 982 {
 983         thread_t        thread = current_thread();
 984
 985         thread->rwlock_count++;
 986         if (atomic_test_and_set32(&lock->lck_rw_data,
 987             (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
 988             LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
 989 #if     CONFIG_DTRACE
 990                 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
 991 #endif  /* CONFIG_DTRACE */
 992         } else {
 993                 lck_rw_lock_exclusive_gen(lock);
 994         }
 995 #if MACH_ASSERT
 996         thread_t owner = ordered_load_rw_owner(lock);
 997         assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
 998 #endif
 999         ordered_store_rw_owner(lock, thread);
1000 }
1001
1002 /*
1003  *      Routine:        lck_rw_lock_shared
1004  */
1005 void
1006 lck_rw_lock_shared(lck_rw_t *lock)
1007 {
1008         uint32_t        data, prev;
1009
1010         current_thread()->rwlock_count++;
1011         for (;;) {
1012                 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1013                 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1014                         atomic_exchange_abort();
1015                         lck_rw_lock_shared_gen(lock);
1016                         break;
1017                 }
1018                 data += LCK_RW_SHARED_READER;
1019                 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1020                         break;
1021                 }
1022                 cpu_pause();
1023         }
1024 #if MACH_ASSERT
1025         thread_t owner = ordered_load_rw_owner(lock);
1026         assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1027 #endif
1028 #if     CONFIG_DTRACE
1029         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1030 #endif  /* CONFIG_DTRACE */
1031         return;
1032 }
1033
/*
 *      Routine:        lck_rw_lock_shared_to_exclusive
 *      Function:
 *              Try to upgrade a shared hold to an exclusive one.
 *
 *              In both outcomes the caller's read count is removed from
 *              the lock word.  If LCK_RW_WANT_UPGRADE is already set, the
 *              upgrade is abandoned and the result of
 *              lck_rw_lock_shared_to_exclusive_failure() is returned.
 *              Otherwise this thread sets LCK_RW_WANT_UPGRADE, waits in
 *              lck_rw_lock_shared_to_exclusive_success() if other readers
 *              remain, records itself as owner and returns TRUE.
 *
 *      False returned upon failure, in this case the shared lock is dropped.
 */
boolean_t
lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
{
        uint32_t        data, prev;

        for (;;) {
                data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
                if (data & LCK_RW_INTERLOCK) {
                        /* interlock is held: abandon this attempt and retry after spinning */
                        atomic_exchange_abort();
                        lck_rw_interlock_spin(lock);
                        continue;
                }
                if (data & LCK_RW_WANT_UPGRADE) {
                        /* another upgrade is already pending: just drop our read count */
                        data -= LCK_RW_SHARED_READER;
                        if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
                                data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
                        }
                        if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
                                /* 'prev' is the lock word as it was before our update */
                                return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
                        }
                } else {
                        data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
                        data -= LCK_RW_SHARED_READER;           /* and shed our read count */
                        if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
                                break;
                        }
                }
                /* the exchange did not complete: pause, then retry from a fresh load */
                cpu_pause();
        }
        /* we now own the WANT_UPGRADE */
        /* 'data' still holds the value we just stored, so its reader count excludes us */
        if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
                lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
        }
#if MACH_ASSERT
        thread_t owner = ordered_load_rw_owner(lock);
        assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
#endif
        ordered_store_rw_owner(lock, current_thread());
#if     CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
#endif  /* CONFIG_DTRACE */
        return TRUE;
}
1082
1083
1084 /*
1085  *      Routine:        lck_rw_lock_shared_to_exclusive_failure
1086  *      Function:
1087  *              Fast path code has already dropped our read
1088  *              count and determined that someone else owns 'lck_rw_want_upgrade'
1089  *              if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1090  *              all we need to do here is determine if a wakeup is needed
1091  */
1092 static boolean_t
1093 lck_rw_lock_shared_to_exclusive_failure(
1094         lck_rw_t        *lck,
1095         uint32_t        prior_lock_state)
1096 {
1097         thread_t        thread = current_thread();
1098         uint32_t        rwlock_count;
1099
1100         /* Check if dropping the lock means that we need to unpromote */
1101         rwlock_count = thread->rwlock_count--;
1102 #if MACH_LDEBUG
1103         if (rwlock_count == 0) {
1104                 panic("rw lock count underflow for thread %p", thread);
1105         }
1106 #endif
1107         if ((prior_lock_state & LCK_RW_W_WAITING) &&
1108             ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1109                 /*
1110                  *      Someone else has requested upgrade.
1111                  *      Since we've released the read lock, wake
1112                  *      him up if he's blocked waiting
1113                  */
1114                 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1115         }
1116
1117         if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1118                 /* sched_flags checked without lock, but will be rechecked while clearing */
1119                 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1120         }
1121
1122         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1123             VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1124
1125         return FALSE;
1126 }
1127
/*
 *      Routine:        lck_rw_lock_shared_to_exclusive_success
 *      Function:
 *              The fast path (lck_rw_lock_shared_to_exclusive) has already
 *              dropped our read count and successfully acquired
 *              'lck_rw_want_upgrade'.
 *              We just need to wait for the rest of the readers to drain
 *              and then we can return as the exclusive holder of this lock.
 *
 *              Each pass first waits in lck_rw_drain_status(); if readers
 *              are still present afterwards and the lock may sleep, the
 *              thread sets 'w_waiting' under the interlock and blocks on
 *              the writer event.  Always returns TRUE.
 *
 *              NOTE(review): lck_rw_drain_status() is defined outside this
 *              view; its last argument is presumably a "wait" flag
 *              (FALSE = poll once, TRUE = spin until a deadline) - confirm.
 */
static boolean_t
lck_rw_lock_shared_to_exclusive_success(
        lck_rw_t        *lock)
{
        __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
        int                     slept = 0;      /* number of times we called thread_block() */
        lck_rw_word_t           word;           /* local snapshot of the lock word */
        wait_result_t           res;
        boolean_t               istate;         /* value returned by lck_interlock_lock(), passed back on unlock */
        boolean_t               not_shared;

#if     CONFIG_DTRACE
        uint64_t                wait_interval = 0;
        int                     readers_at_sleep = 0;
        boolean_t               dtrace_ls_initialized = FALSE;
        boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

        while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
                word.data = ordered_load_rw(lock);
#if     CONFIG_DTRACE
                /* sample the lockstat probes once, on the first pass through the loop */
                if (dtrace_ls_initialized == FALSE) {
                        dtrace_ls_initialized = TRUE;
                        dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
                        dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
                        dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
                        if (dtrace_ls_enabled) {
                                /*
                                 * Either sleeping or spinning is happening,
                                 *  start a timing of our delay interval now.
                                 */
                                readers_at_sleep = word.shared_count;
                                wait_interval = mach_absolute_time();
                        }
                }
#endif

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
                    trace_lck, word.shared_count, 0, 0, 0);

                not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
                    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);

                if (not_shared) {
                        break;
                }

                /*
                 * if we get here, the waiting call to lck_rw_drain_status()
                 * returned w/o the rw_shared_count having drained to 0
                 * check to see if we're allowed to do a thread_block
                 * (if not, we simply go around the loop again)
                 */
                if (word.can_sleep) {
                        istate = lck_interlock_lock(lock);

                        /* re-read the lock word now that we hold the interlock */
                        word.data = ordered_load_rw(lock);
                        if (word.shared_count != 0) {
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
                                    trace_lck, word.shared_count, 0, 0, 0);

                                /* publish that a writer is waiting before giving up the interlock */
                                word.w_waiting = 1;
                                ordered_store_rw(lock, word.data);

                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
                                res = assert_wait(LCK_RW_WRITER_EVENT(lock),
                                    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lock, istate);

                                if (res == THREAD_WAITING) {
                                        res = thread_block(THREAD_CONTINUE_NULL);
                                        slept++;
                                }
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
                                    trace_lck, res, slept, 0, 0);
                        } else {
                                /* readers drained while we were taking the interlock */
                                lck_interlock_unlock(lock, istate);
                                break;
                        }
                }
        }
#if     CONFIG_DTRACE
        /*
         * dtrace_ls_enabled is only set if we entered the wait loop above
         * with a lockstat probe enabled; 'slept' tells the spin-only case
         * apart from the case where we blocked at least once.
         */
        if (dtrace_ls_enabled == TRUE) {
                if (slept == 0) {
                        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
                } else {
                        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
                            mach_absolute_time() - wait_interval, 1,
                            (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
                }
        }
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
#endif
        return TRUE;
}
1235
1236
1237 /*
1238  *      Routine:        lck_rw_lock_exclusive_to_shared
1239  */
1240
1241 void
1242 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1243 {
1244         uint32_t        data, prev;
1245
1246         assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1247         ordered_store_rw_owner(lock, THREAD_NULL);
1248         for (;;) {
1249                 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1250                 if (data & LCK_RW_INTERLOCK) {
1251 #if __SMP__
1252                         atomic_exchange_abort();
1253                         lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1254                         continue;
1255 #else
1256                         panic("lck_rw_lock_exclusive_to_shared(): Interlock locked (%p): %x", lock, data);
1257 #endif // __SMP__
1258                 }
1259                 data += LCK_RW_SHARED_READER;
1260                 if (data & LCK_RW_WANT_UPGRADE) {
1261                         data &= ~(LCK_RW_WANT_UPGRADE);
1262                 } else {
1263                         data &= ~(LCK_RW_WANT_EXCL);
1264                 }
1265                 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1266                         data &= ~(LCK_RW_W_WAITING);
1267                 }
1268                 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1269                         break;
1270                 }
1271                 cpu_pause();
1272         }
1273         return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1274 }
1275
1276 /*
1277  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1278  *      Function:
1279  *              Fast path has already dropped
1280  *              our exclusive state and bumped lck_rw_shared_count
1281  *              all we need to do here is determine if anyone
1282  *              needs to be awakened.
1283  */
1284 static void
1285 lck_rw_lock_exclusive_to_shared_gen(
1286         lck_rw_t        *lck,
1287         uint32_t        prior_lock_state)
1288 {
1289         __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1290         lck_rw_word_t   fake_lck;
1291
1292         /*
1293          * prior_lock state is a snapshot of the 1st word of the
1294          * lock in question... we'll fake up a pointer to it
1295          * and carefully not access anything beyond whats defined
1296          * in the first word of a lck_rw_t
1297          */
1298         fake_lck.data = prior_lock_state;
1299
1300         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1301             trace_lck, fake_lck->want_excl, fake_lck->want_upgrade, 0, 0);
1302
1303         /*
1304          * don't wake up anyone waiting to take the lock exclusively
1305          * since we hold a read count... when the read count drops to 0,
1306          * the writers will be woken.
1307          *
1308          * wake up any waiting readers if we don't have any writers waiting,
1309          * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1310          */
1311         if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1312                 thread_wakeup(LCK_RW_READER_EVENT(lck));
1313         }
1314
1315         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1316             trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1317
1318 #if CONFIG_DTRACE
1319         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1320 #endif
1321 }
1322
1323
1324 /*
1325  *      Routine:        lck_rw_try_lock
1326  */
1327 boolean_t
1328 lck_rw_try_lock(
1329         lck_rw_t                *lck,
1330         lck_rw_type_t   lck_rw_type)
1331 {
1332         if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1333                 return lck_rw_try_lock_shared(lck);
1334         } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1335                 return lck_rw_try_lock_exclusive(lck);
1336         } else {
1337                 panic("lck_rw_try_lock(): Invalid rw lock type: %x", lck_rw_type);
1338         }
1339         return FALSE;
1340 }
1341
1342 /*
1343  *      Routine:        lck_rw_try_lock_shared
1344  */
1345
1346 boolean_t
1347 lck_rw_try_lock_shared(lck_rw_t *lock)
1348 {
1349         uint32_t        data, prev;
1350
1351         for (;;) {
1352                 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1353                 if (data & LCK_RW_INTERLOCK) {
1354 #if __SMP__
1355                         atomic_exchange_abort();
1356                         lck_rw_interlock_spin(lock);
1357                         continue;
1358 #else
1359                         panic("lck_rw_try_lock_shared(): Interlock locked (%p): %x", lock, data);
1360 #endif
1361                 }
1362                 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1363                         atomic_exchange_abort();
1364                         return FALSE;                                           /* lock is busy */
1365                 }
1366                 data += LCK_RW_SHARED_READER;                   /* Increment reader refcount */
1367                 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1368                         break;
1369                 }
1370                 cpu_pause();
1371         }
1372 #if MACH_ASSERT
1373         thread_t owner = ordered_load_rw_owner(lock);
1374         assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1375 #endif
1376         current_thread()->rwlock_count++;
1377 #if     CONFIG_DTRACE
1378         LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1379 #endif  /* CONFIG_DTRACE */
1380         return TRUE;
1381 }
1382
1383
1384 /*
1385  *      Routine:        lck_rw_try_lock_exclusive
1386  */
1387
1388 boolean_t
1389 lck_rw_try_lock_exclusive(lck_rw_t *lock)
1390 {
1391         uint32_t        data, prev;
1392         thread_t        thread;
1393
1394         for (;;) {
1395                 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1396                 if (data & LCK_RW_INTERLOCK) {
1397 #if __SMP__
1398                         atomic_exchange_abort();
1399                         lck_rw_interlock_spin(lock);
1400                         continue;
1401 #else
1402                         panic("lck_rw_try_lock_exclusive(): Interlock locked (%p): %x", lock, data);
1403 #endif
1404                 }
1405                 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1406                         atomic_exchange_abort();
1407                         return FALSE;
1408                 }
1409                 data |= LCK_RW_WANT_EXCL;
1410                 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1411                         break;
1412                 }
1413                 cpu_pause();
1414         }
1415         thread = current_thread();
1416         thread->rwlock_count++;
1417 #if MACH_ASSERT
1418         thread_t owner = ordered_load_rw_owner(lock);
1419         assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1420 #endif
1421         ordered_store_rw_owner(lock, thread);
1422 #if     CONFIG_DTRACE
1423         LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1424 #endif  /* CONFIG_DTRACE */
1425         return TRUE;
1426 }
1427
1428
1429 /*
1430  *      Routine:        lck_rw_unlock
1431  */
1432 void
1433 lck_rw_unlock(
1434         lck_rw_t                *lck,
1435         lck_rw_type_t   lck_rw_type)
1436 {
1437         if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1438                 lck_rw_unlock_shared(lck);
1439         } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1440                 lck_rw_unlock_exclusive(lck);
1441         } else {
1442                 panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
1443         }
1444 }
1445
1446
1447 /*
1448  *      Routine:        lck_rw_unlock_shared
1449  */
1450 void
1451 lck_rw_unlock_shared(
1452         lck_rw_t        *lck)
1453 {
1454         lck_rw_type_t   ret;
1455
1456         assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1457         assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
1458         ret = lck_rw_done(lck);
1459
1460         if (ret != LCK_RW_TYPE_SHARED) {
1461                 panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
1462         }
1463 }
1464
1465
1466 /*
1467  *      Routine:        lck_rw_unlock_exclusive
1468  */
1469 void
1470 lck_rw_unlock_exclusive(
1471         lck_rw_t        *lck)
1472 {
1473         lck_rw_type_t   ret;
1474
1475         assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1476         ret = lck_rw_done(lck);
1477
1478         if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1479                 panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
1480         }
1481 }
1482
1483
1484 /*
1485  *      Routine:        lck_rw_lock_exclusive_gen
1486  */
1487 static void
1488 lck_rw_lock_exclusive_gen(
1489         lck_rw_t        *lock)
1490 {
1491         __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1492         lck_rw_word_t           word;
1493         int                     slept = 0;
1494         boolean_t               gotlock = 0;
1495         boolean_t               not_shared_or_upgrade = 0;
1496         wait_result_t           res = 0;
1497         boolean_t               istate;
1498
1499 #if     CONFIG_DTRACE
1500         boolean_t dtrace_ls_initialized = FALSE;
1501         boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1502         uint64_t wait_interval = 0;
1503         int readers_at_sleep = 0;
1504 #endif
1505
1506         /*
1507          *      Try to acquire the lck_rw_want_excl bit.
1508          */
1509         while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
1510 #if     CONFIG_DTRACE
1511                 if (dtrace_ls_initialized == FALSE) {
1512                         dtrace_ls_initialized = TRUE;
1513                         dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1514                         dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1515                         dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1516                         if (dtrace_ls_enabled) {
1517                                 /*
1518                                  * Either sleeping or spinning is happening,
1519                                  *  start a timing of our delay interval now.
1520                                  */
1521                                 readers_at_sleep = lock->lck_rw_shared_count;
1522                                 wait_interval = mach_absolute_time();
1523                         }
1524                 }
1525 #endif
1526
1527                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1528
1529                 gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);
1530
1531                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1532
1533                 if (gotlock) {
1534                         break;
1535                 }
1536                 /*
1537                  * if we get here, the deadline has expired w/o us
1538                  * being able to grab the lock exclusively
1539                  * check to see if we're allowed to do a thread_block
1540                  */
1541                 word.data = ordered_load_rw(lock);
1542                 if (word.can_sleep) {
1543                         istate = lck_interlock_lock(lock);
1544                         word.data = ordered_load_rw(lock);
1545
1546                         if (word.want_excl) {
1547                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1548
1549                                 word.w_waiting = 1;
1550                                 ordered_store_rw(lock, word.data);
1551
1552                                 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1553                                 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1554                                     THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1555                                 lck_interlock_unlock(lock, istate);
1556
1557                                 if (res == THREAD_WAITING) {
1558                                         res = thread_block(THREAD_CONTINUE_NULL);
1559                                         slept++;
1560                                 }
1561                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1562                         } else {
1563                                 word.want_excl = 1;
1564                                 ordered_store_rw(lock, word.data);
1565                                 lck_interlock_unlock(lock, istate);
1566                                 break;
1567                         }
1568                 }
1569         }
1570         /*
1571          * Wait for readers (and upgrades) to finish...
1572          */
1573         while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
1574 #if     CONFIG_DTRACE
1575                 /*
1576                  * Either sleeping or spinning is happening, start
1577                  * a timing of our delay interval now.  If we set it
1578                  * to -1 we don't have accurate data so we cannot later
1579                  * decide to record a dtrace spin or sleep event.
1580                  */
1581                 if (dtrace_ls_initialized == FALSE) {
1582                         dtrace_ls_initialized = TRUE;
1583                         dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1584                         dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1585                         dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1586                         if (dtrace_ls_enabled) {
1587                                 /*
1588                                  * Either sleeping or spinning is happening,
1589                                  *  start a timing of our delay interval now.
1590                                  */
1591                                 readers_at_sleep = lock->lck_rw_shared_count;
1592                                 wait_interval = mach_absolute_time();
1593                         }
1594                 }
1595 #endif
1596
1597                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1598
1599                 not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);
1600
1601                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);
1602
1603                 if (not_shared_or_upgrade) {
1604                         break;
1605                 }
1606                 /*
1607                  * if we get here, the deadline has expired w/o us
1608                  * being able to grab the lock exclusively
1609                  * check to see if we're allowed to do a thread_block
1610                  */
1611                 word.data = ordered_load_rw(lock);
1612                 if (word.can_sleep) {
1613                         istate = lck_interlock_lock(lock);
1614                         word.data = ordered_load_rw(lock);
1615
1616                         if (word.shared_count != 0 || word.want_upgrade) {
1617                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1618
1619                                 word.w_waiting = 1;
1620                                 ordered_store_rw(lock, word.data);
1621
1622                                 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1623                                 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1624                                     THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1625                                 lck_interlock_unlock(lock, istate);
1626
1627                                 if (res == THREAD_WAITING) {
1628                                         res = thread_block(THREAD_CONTINUE_NULL);
1629                                         slept++;
1630                                 }
1631                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1632                         } else {
1633                                 lck_interlock_unlock(lock, istate);
1634                                 /*
1635                                  * must own the lock now, since we checked for
1636                                  * readers or upgrade owner behind the interlock
1637                                  * no need for a call to 'lck_rw_drain_status'
1638                                  */
1639                                 break;
1640                         }
1641                 }
1642         }
1643
1644 #if     CONFIG_DTRACE
1645         /*
1646          * Decide what latencies we suffered that are Dtrace events.
1647          * If we have set wait_interval, then we either spun or slept.
1648          * At least we get out from under the interlock before we record
1649          * which is the best we can do here to minimize the impact
1650          * of the tracing.
1651          * If we have set wait_interval to -1, then dtrace was not enabled when we
1652          * started sleeping/spinning so we don't record this event.
1653          */
1654         if (dtrace_ls_enabled == TRUE) {
1655                 if (slept == 0) {
1656                         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
1657                             mach_absolute_time() - wait_interval, 1);
1658                 } else {
1659                         /*
1660                          * For the blocking case, we also record if when we blocked
1661                          * it was held for read or write, and how many readers.
1662                          * Notice that above we recorded this before we dropped
1663                          * the interlock so the count is accurate.
1664                          */
1665                         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
1666                             mach_absolute_time() - wait_interval, 1,
1667                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1668                 }
1669         }
1670         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
1671 #endif  /* CONFIG_DTRACE */
1672 }
1673
1674 /*
1675  *      Routine:        lck_rw_done
1676  */
1677
1678 lck_rw_type_t
1679 lck_rw_done(lck_rw_t *lock)
1680 {
1681         uint32_t        data, prev;
1682         boolean_t       once = FALSE;
1683
1684         for (;;) {
1685                 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1686                 if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
1687 #if __SMP__
1688                         atomic_exchange_abort();
1689                         lck_rw_interlock_spin(lock);
1690                         continue;
1691 #else
1692                         panic("lck_rw_done(): Interlock locked (%p): %x", lock, data);
1693 #endif // __SMP__
1694                 }
1695                 if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
1696                         assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1697                         data -= LCK_RW_SHARED_READER;
1698                         if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1699                                 goto check_waiters;
1700                         }
1701                 } else {                                        /* if reader count == 0, must be exclusive lock */
1702                         if (data & LCK_RW_WANT_UPGRADE) {
1703                                 data &= ~(LCK_RW_WANT_UPGRADE);
1704                         } else {
1705                                 if (data & LCK_RW_WANT_EXCL) {
1706                                         data &= ~(LCK_RW_WANT_EXCL);
1707                                 } else {                                /* lock is not 'owned', panic */
1708                                         panic("Releasing non-exclusive RW lock without a reader refcount!");
1709                                 }
1710                         }
1711                         if (!once) {
1712                                 // Only check for holder and clear it once
1713                                 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1714                                 ordered_store_rw_owner(lock, THREAD_NULL);
1715                                 once = TRUE;
1716                         }
1717 check_waiters:
1718                         /*
1719                          * test the original values to match what
1720                          * lck_rw_done_gen is going to do to determine
1721                          * which wakeups need to happen...
1722                          *
1723                          * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
1724                          */
1725                         if (prev & LCK_RW_W_WAITING) {
1726                                 data &= ~(LCK_RW_W_WAITING);
1727                                 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1728                                         data &= ~(LCK_RW_R_WAITING);
1729                                 }
1730                         } else {
1731                                 data &= ~(LCK_RW_R_WAITING);
1732                         }
1733                 }
1734                 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1735                         break;
1736                 }
1737                 cpu_pause();
1738         }
1739         return lck_rw_done_gen(lock, prev);
1740 }
1741
1742 /*
1743  *      Routine:        lck_rw_done_gen
1744  *
1745  *      called from the assembly language wrapper...
1746  *      prior_lock_state is the value in the 1st
1747  *      word of the lock at the time of a successful
1748  *      atomic compare and exchange with the new value...
1749  *      it represents the state of the lock before we
1750  *      decremented the rw_shared_count or cleared either
1751  *      rw_want_upgrade or rw_want_write and
1752  *      the lck_x_waiting bits...  since the wrapper
1753  *      routine has already changed the state atomically,
1754  *      we just need to decide if we should
1755  *      wake up anyone and what value to return... we do
1756  *      this by examining the state of the lock before
1757  *      we changed it
1758  */
1759 static lck_rw_type_t
1760 lck_rw_done_gen(
1761         lck_rw_t        *lck,
1762         uint32_t        prior_lock_state)
1763 {
1764         lck_rw_word_t   fake_lck;
1765         lck_rw_type_t   lock_type;
1766         thread_t                thread;
1767         uint32_t                rwlock_count;
1768
1769         /*
1770          * prior_lock state is a snapshot of the 1st word of the
1771          * lock in question... we'll fake up a pointer to it
1772          * and carefully not access anything beyond whats defined
1773          * in the first word of a lck_rw_t
1774          */
1775         fake_lck.data = prior_lock_state;
1776
1777         if (fake_lck.shared_count <= 1) {
1778                 if (fake_lck.w_waiting) {
1779                         thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1780                 }
1781
1782                 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1783                         thread_wakeup(LCK_RW_READER_EVENT(lck));
1784                 }
1785         }
1786         if (fake_lck.shared_count) {
1787                 lock_type = LCK_RW_TYPE_SHARED;
1788         } else {
1789                 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1790         }
1791
1792         /* Check if dropping the lock means that we need to unpromote */
1793         thread = current_thread();
1794         rwlock_count = thread->rwlock_count--;
1795 #if MACH_LDEBUG
1796         if (rwlock_count == 0) {
1797                 panic("rw lock count underflow for thread %p", thread);
1798         }
1799 #endif
1800         if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1801                 /* sched_flags checked without lock, but will be rechecked while clearing */
1802                 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1803         }
1804 #if CONFIG_DTRACE
1805         LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1806 #endif
1807         return lock_type;
1808 }
1809
1810 /*
1811  *      Routine:        lck_rw_lock_shared_gen
1812  *      Function:
1813  *              Fast path code has determined that this lock
1814  *              is held exclusively... this is where we spin/block
1815  *              until we can acquire the lock in the shared mode
1816  */
1817 static void
1818 lck_rw_lock_shared_gen(
1819         lck_rw_t        *lck)
1820 {
1821         __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1822         lck_rw_word_t           word;
1823         boolean_t               gotlock = 0;
1824         int                     slept = 0;
1825         wait_result_t           res = 0;
1826         boolean_t               istate;
1827
1828 #if     CONFIG_DTRACE
1829         uint64_t wait_interval = 0;
1830         int readers_at_sleep = 0;
1831         boolean_t dtrace_ls_initialized = FALSE;
1832         boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1833 #endif /* CONFIG_DTRACE */
1834
1835         while (!lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
1836 #if     CONFIG_DTRACE
1837                 if (dtrace_ls_initialized == FALSE) {
1838                         dtrace_ls_initialized = TRUE;
1839                         dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1840                         dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1841                         dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1842                         if (dtrace_ls_enabled) {
1843                                 /*
1844                                  * Either sleeping or spinning is happening,
1845                                  *  start a timing of our delay interval now.
1846                                  */
1847                                 readers_at_sleep = lck->lck_rw_shared_count;
1848                                 wait_interval = mach_absolute_time();
1849                         }
1850                 }
1851 #endif
1852
1853                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1854                     trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1855
1856                 gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);
1857
1858                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1859                     trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);
1860
1861                 if (gotlock) {
1862                         break;
1863                 }
1864                 /*
1865                  * if we get here, the deadline has expired w/o us
1866                  * being able to grab the lock for read
1867                  * check to see if we're allowed to do a thread_block
1868                  */
1869                 if (lck->lck_rw_can_sleep) {
1870                         istate = lck_interlock_lock(lck);
1871
1872                         word.data = ordered_load_rw(lck);
1873                         if ((word.want_excl || word.want_upgrade) &&
1874                             ((word.shared_count == 0) || word.priv_excl)) {
1875                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1876                                     trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1877
1878                                 word.r_waiting = 1;
1879                                 ordered_store_rw(lck, word.data);
1880
1881                                 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1882                                 res = assert_wait(LCK_RW_READER_EVENT(lck),
1883                                     THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1884                                 lck_interlock_unlock(lck, istate);
1885
1886                                 if (res == THREAD_WAITING) {
1887                                         res = thread_block(THREAD_CONTINUE_NULL);
1888                                         slept++;
1889                                 }
1890                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1891                                     trace_lck, res, slept, 0, 0);
1892                         } else {
1893                                 word.shared_count++;
1894                                 ordered_store_rw(lck, word.data);
1895                                 lck_interlock_unlock(lck, istate);
1896                                 break;
1897                         }
1898                 }
1899         }
1900
1901 #if     CONFIG_DTRACE
1902         if (dtrace_ls_enabled == TRUE) {
1903                 if (slept == 0) {
1904                         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1905                 } else {
1906                         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1907                             mach_absolute_time() - wait_interval, 0,
1908                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1909                 }
1910         }
1911         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1912 #endif  /* CONFIG_DTRACE */
1913 }
1914
1915
1916 void
1917 lck_rw_assert(
1918         lck_rw_t                *lck,
1919         unsigned int    type)
1920 {
1921         switch (type) {
1922         case LCK_RW_ASSERT_SHARED:
1923                 if ((lck->lck_rw_shared_count != 0) &&
1924                     (lck->lck_rw_owner == THREAD_NULL)) {
1925                         return;
1926                 }
1927                 break;
1928         case LCK_RW_ASSERT_EXCLUSIVE:
1929                 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1930                     (lck->lck_rw_shared_count == 0) &&
1931                     (lck->lck_rw_owner == current_thread())) {
1932                         return;
1933                 }
1934                 break;
1935         case LCK_RW_ASSERT_HELD:
1936                 if (lck->lck_rw_shared_count != 0) {
1937                         return;         // Held shared
1938                 }
1939                 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1940                     (lck->lck_rw_owner == current_thread())) {
1941                         return;         // Held exclusive
1942                 }
1943                 break;
1944         case LCK_RW_ASSERT_NOTHELD:
1945                 if ((lck->lck_rw_shared_count == 0) &&
1946                     !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1947                     (lck->lck_rw_owner == THREAD_NULL)) {
1948                         return;
1949                 }
1950                 break;
1951         default:
1952                 break;
1953         }
1954         panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
1955 }
1956
1957
1958 /*
1959  * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1960  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1961  */
1962 boolean_t
1963 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
1964 {
1965         if (not_in_kdp) {
1966                 panic("panic: rw lock exclusive check done outside of kernel debugger");
1967         }
1968         return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1969 }
1970
1971 /*
1972  * The C portion of the mutex package.  These routines are only invoked
1973  * if the optimized assembler routines can't do the work.
1974  */
1975
1976 /*
1977  * Forward declaration
1978  */
1979
1980 void
1981 lck_mtx_ext_init(
1982         lck_mtx_ext_t * lck,
1983         lck_grp_t * grp,
1984         lck_attr_t * attr);
1985
1986 /*
1987  *      Routine:        lck_mtx_alloc_init
1988  */
1989 lck_mtx_t      *
1990 lck_mtx_alloc_init(
1991         lck_grp_t * grp,
1992         lck_attr_t * attr)
1993 {
1994         lck_mtx_t      *lck;
1995
1996         if ((lck = (lck_mtx_t *) kalloc(sizeof(lck_mtx_t))) != 0) {
1997                 lck_mtx_init(lck, grp, attr);
1998         }
1999
2000         return lck;
2001 }
2002
2003 /*
2004  *      Routine:        lck_mtx_free
2005  */
2006 void
2007 lck_mtx_free(
2008         lck_mtx_t * lck,
2009         lck_grp_t * grp)
2010 {
2011         lck_mtx_destroy(lck, grp);
2012         kfree(lck, sizeof(lck_mtx_t));
2013 }
2014
2015 /*
2016  *      Routine:        lck_mtx_init
2017  */
2018 void
2019 lck_mtx_init(
2020         lck_mtx_t * lck,
2021         lck_grp_t * grp,
2022         lck_attr_t * attr)
2023 {
2024 #ifdef  BER_XXX
2025         lck_mtx_ext_t  *lck_ext;
2026 #endif
2027         lck_attr_t     *lck_attr;
2028
2029         if (attr != LCK_ATTR_NULL) {
2030                 lck_attr = attr;
2031         } else {
2032                 lck_attr = &LockDefaultLckAttr;
2033         }
2034
2035 #ifdef  BER_XXX
2036         if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2037                 if ((lck_ext = (lck_mtx_ext_t *) kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2038                         lck_mtx_ext_init(lck_ext, grp, lck_attr);
2039                         lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2040                         lck->lck_mtx_ptr = lck_ext;
2041                         lck->lck_mtx_type = LCK_MTX_TYPE;
2042                 }
2043         } else
2044 #endif
2045         {
2046                 lck->lck_mtx_ptr = NULL;                // Clear any padding in the union fields below
2047                 lck->lck_mtx_waiters = 0;
2048                 lck->lck_mtx_type = LCK_MTX_TYPE;
2049                 ordered_store_mtx(lck, 0);
2050         }
2051         lck_grp_reference(grp);
2052         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2053 }
2054
2055 /*
2056  *      Routine:        lck_mtx_init_ext
2057  */
2058 void
2059 lck_mtx_init_ext(
2060         lck_mtx_t * lck,
2061         lck_mtx_ext_t * lck_ext,
2062         lck_grp_t * grp,
2063         lck_attr_t * attr)
2064 {
2065         lck_attr_t     *lck_attr;
2066
2067         if (attr != LCK_ATTR_NULL) {
2068                 lck_attr = attr;
2069         } else {
2070                 lck_attr = &LockDefaultLckAttr;
2071         }
2072
2073         if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2074                 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2075                 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2076                 lck->lck_mtx_ptr = lck_ext;
2077                 lck->lck_mtx_type = LCK_MTX_TYPE;
2078         } else {
2079                 lck->lck_mtx_waiters = 0;
2080                 lck->lck_mtx_type = LCK_MTX_TYPE;
2081                 ordered_store_mtx(lck, 0);
2082         }
2083         lck_grp_reference(grp);
2084         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2085 }
2086
2087 /*
2088  *      Routine:        lck_mtx_ext_init
2089  */
2090 void
2091 lck_mtx_ext_init(
2092         lck_mtx_ext_t * lck,
2093         lck_grp_t * grp,
2094         lck_attr_t * attr)
2095 {
2096         bzero((void *) lck, sizeof(lck_mtx_ext_t));
2097
2098         lck->lck_mtx.lck_mtx_type = LCK_MTX_TYPE;
2099
2100         if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2101                 lck->lck_mtx_deb.type = MUTEX_TAG;
2102                 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2103         }
2104         lck->lck_mtx_grp = grp;
2105
2106         if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2107                 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2108         }
2109 }
2110
2111 /* The slow versions */
2112 static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2113 static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
2114 static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2115
2116 /* The adaptive spin function */
2117 static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2118
2119 /*
2120  *      Routine:        lck_mtx_verify
2121  *
2122  *      Verify if a mutex is valid
2123  */
2124 static inline void
2125 lck_mtx_verify(lck_mtx_t *lock)
2126 {
2127         if (lock->lck_mtx_type != LCK_MTX_TYPE) {
2128                 panic("Invalid mutex %p", lock);
2129         }
2130 #if     DEVELOPMENT || DEBUG
2131         if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2132                 panic("Mutex destroyed %p", lock);
2133         }
2134 #endif  /* DEVELOPMENT || DEBUG */
2135 }
2136
2137 /*
2138  *      Routine:        lck_mtx_check_preemption
2139  *
2140  *      Verify preemption is enabled when attempting to acquire a mutex.
2141  */
2142
2143 static inline void
2144 lck_mtx_check_preemption(lck_mtx_t *lock)
2145 {
2146 #if     DEVELOPMENT || DEBUG
2147         int pl = get_preemption_level();
2148
2149         if (pl != 0) {
2150                 panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
2151         }
2152 #else
2153         (void)lock;
2154 #endif
2155 }
2156
2157 /*
2158  *      Routine:        lck_mtx_lock
2159  */
2160 void
2161 lck_mtx_lock(lck_mtx_t *lock)
2162 {
2163         thread_t        thread;
2164
2165         lck_mtx_verify(lock);
2166         lck_mtx_check_preemption(lock);
2167         thread = current_thread();
2168         if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2169             0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2170 #if     CONFIG_DTRACE
2171                 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2172 #endif /* CONFIG_DTRACE */
2173                 return;
2174         }
2175         lck_mtx_lock_contended(lock, thread, FALSE);
2176 }
2177
2178 /*
2179  *       This is the slow version of mutex locking.
2180  */
2181 static void NOINLINE
2182 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2183 {
2184         thread_t                holding_thread;
2185         uintptr_t               state;
2186         int                     waiters = 0;
2187         spinwait_result_t       sw_res;
2188         struct turnstile        *ts = NULL;
2189
2190         /* Loop waiting until I see that the mutex is unowned */
2191         for (;;) {
2192                 sw_res = lck_mtx_lock_contended_spinwait_arm(lock, thread, interlocked);
2193                 interlocked = FALSE;
2194
2195                 switch (sw_res) {
2196                 case SPINWAIT_ACQUIRED:
2197                         if (ts != NULL) {
2198                                 interlock_lock(lock);
2199                                 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2200                                 interlock_unlock(lock);
2201                         }
2202                         goto done;
2203                 case SPINWAIT_INTERLOCK:
2204                         goto set_owner;
2205                 default:
2206                         break;
2207                 }
2208
2209                 state = ordered_load_mtx(lock);
2210                 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2211                 if (holding_thread == NULL) {
2212                         break;
2213                 }
2214                 ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
2215                 lck_mtx_lock_wait(lock, holding_thread, &ts);
2216                 /* returns interlock unlocked */
2217         }
2218
2219 set_owner:
2220         /* Hooray, I'm the new owner! */
2221         state = ordered_load_mtx(lock);
2222
2223         if (state & ARM_LCK_WAITERS) {
2224                 /* Skip lck_mtx_lock_acquire if there are no waiters. */
2225                 waiters = lck_mtx_lock_acquire(lock, ts);
2226                 /*
2227                  * lck_mtx_lock_acquire will call
2228                  * turnstile_complete
2229                  */
2230         } else {
2231                 if (ts != NULL) {
2232                         turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2233                 }
2234         }
2235
2236         state = LCK_MTX_THREAD_TO_STATE(thread);
2237         if (waiters != 0) {
2238                 state |= ARM_LCK_WAITERS;
2239         }
2240 #if __SMP__
2241         state |= LCK_ILOCK;                             // Preserve interlock
2242         ordered_store_mtx(lock, state); // Set ownership
2243         interlock_unlock(lock);                 // Release interlock, enable preemption
2244 #else
2245         ordered_store_mtx(lock, state); // Set ownership
2246         enable_preemption();
2247 #endif
2248
2249 done:
2250         load_memory_barrier();
2251
2252         assert(thread->turnstile != NULL);
2253
2254         if (ts != NULL) {
2255                 turnstile_cleanup();
2256         }
2257
2258 #if CONFIG_DTRACE
2259         LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2260 #endif /* CONFIG_DTRACE */
2261 }
2262
2263 /*
2264  * Routine: lck_mtx_lock_contended_spinwait_arm
2265  *
2266  * Invoked trying to acquire a mutex when there is contention but
2267  * the holder is running on another processor. We spin for up to a maximum
2268  * time waiting for the lock to be released.
2269  */
2270 static spinwait_result_t
2271 lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2272 {
2273         int                     has_interlock = (int)interlocked;
2274 #if __SMP__
2275         __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
2276         thread_t        owner, prev_owner;
2277         uint64_t        window_deadline, sliding_deadline, high_deadline;
2278         uint64_t        start_time, cur_time, avg_hold_time, bias, delta;
2279         int             loopcount = 0;
2280         uint            i, prev_owner_cpu;
2281         int             total_hold_time_samples, window_hold_time_samples, unfairness;
2282         bool            owner_on_core, adjust;
2283         uintptr_t       state, new_state, waiters;
2284         spinwait_result_t       retval = SPINWAIT_DID_SPIN_HIGH_THR;
2285
2286         if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) {
2287                 if (!has_interlock) {
2288                         interlock_lock(lock);
2289                 }
2290
2291                 return SPINWAIT_DID_NOT_SPIN;
2292         }
2293
2294         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2295             trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0);
2296
2297         start_time = mach_absolute_time();
2298         /*
2299          * window_deadline represents the "learning" phase.
2300          * The thread collects statistics about the lock during
2301          * window_deadline and then it makes a decision on whether to spin more
2302          * or block according to the concurrency behavior
2303          * observed.
2304          *
2305          * Every thread can spin at least low_MutexSpin.
2306          */
2307         window_deadline = start_time + low_MutexSpin;
2308         /*
2309          * Sliding_deadline is the adjusted spin deadline
2310          * computed after the "learning" phase.
2311          */
2312         sliding_deadline = window_deadline;
2313         /*
2314          * High_deadline is a hard deadline. No thread
2315          * can spin more than this deadline.
2316          */
2317         if (high_MutexSpin >= 0) {
2318                 high_deadline = start_time + high_MutexSpin;
2319         } else {
2320                 high_deadline = start_time + low_MutexSpin * real_ncpus;
2321         }
2322
2323         /*
2324          * Do not know yet which is the owner cpu.
2325          * Initialize prev_owner_cpu with next cpu.
2326          */
2327         prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
2328         total_hold_time_samples = 0;
2329         window_hold_time_samples = 0;
2330         avg_hold_time = 0;
2331         adjust = TRUE;
2332         bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus;
2333
2334         /* Snoop the lock state */
2335         state = ordered_load_mtx(lock);
2336         owner = LCK_MTX_STATE_TO_THREAD(state);
2337         prev_owner = owner;
2338
2339         if (has_interlock) {
2340                 if (owner == NULL) {
2341                         retval = SPINWAIT_INTERLOCK;
2342                         goto done_spinning;
2343                 } else {
2344                         /*
2345                          * We are holding the interlock, so
2346                          * we can safely dereference owner.
2347                          */
2348                         if (!(owner->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU) ||
2349                             (owner->state & TH_IDLE)) {
2350                                 retval = SPINWAIT_DID_NOT_SPIN;
2351                                 goto done_spinning;
2352                         }
2353                 }
2354                 interlock_unlock(lock);
2355                 has_interlock = 0;
2356         }
2357
2358         /*
2359          * Spin while:
2360          *   - mutex is locked, and
2361          *   - it's locked as a spin lock, and
2362          *   - owner is running on another processor, and
2363          *   - we haven't spun for long enough.
2364          */
2365         do {
2366                 /*
2367                  * Try to acquire the lock.
2368                  */
2369                 owner = LCK_MTX_STATE_TO_THREAD(state);
2370                 if (owner == NULL) {
2371                         waiters = state & ARM_LCK_WAITERS;
2372                         if (waiters) {
2373                                 /*
2374                                  * preserve the waiter bit
2375                                  * and try acquire the interlock.
2376                                  * Note: we will successfully acquire
2377                                  * the interlock only if we can also
2378                                  * acquire the lock.
2379                                  */
2380                                 new_state = ARM_LCK_WAITERS | LCK_ILOCK;
2381                                 has_interlock = 1;
2382                                 retval = SPINWAIT_INTERLOCK;
2383                                 disable_preemption();
2384                         } else {
2385                                 new_state = LCK_MTX_THREAD_TO_STATE(thread);
2386                                 retval = SPINWAIT_ACQUIRED;
2387                         }
2388
2389                         /*
2390                          * The cmpxchg will succeed only if the lock
2391                          * is not owned (doesn't have an owner set)
2392                          * and it is not interlocked.
2393                          * It will not fail if there are waiters.
2394                          */
2395                         if (os_atomic_cmpxchgv(&lock->lck_mtx_data,
2396                             waiters, new_state, &state, acquire)) {
2397                                 goto done_spinning;
2398                         } else {
2399                                 if (waiters) {
2400                                         has_interlock = 0;
2401                                         enable_preemption();
2402                                 }
2403                         }
2404                 }
2405
2406                 cur_time = mach_absolute_time();
2407
2408                 /*
2409                  * Never spin past high_deadline.
2410                  */
2411                 if (cur_time >= high_deadline) {
2412                         retval = SPINWAIT_DID_SPIN_HIGH_THR;
2413                         break;
2414                 }
2415
2416                 /*
2417                  * Check if owner is on core. If not block.
2418                  */
2419                 owner = LCK_MTX_STATE_TO_THREAD(state);
2420                 if (owner) {
2421                         i = prev_owner_cpu;
2422                         owner_on_core = FALSE;
2423
2424                         disable_preemption();
2425                         state = ordered_load_mtx(lock);
2426                         owner = LCK_MTX_STATE_TO_THREAD(state);
2427
2428                         /*
2429                          * For scalability we want to check if the owner is on core
2430                          * without locking the mutex interlock.
2431                          * If we do not lock the mutex interlock, the owner that we see might be
2432                          * invalid, so we cannot dereference it. Therefore we cannot check
2433                          * any field of the thread to tell us if it is on core.
2434                          * Check if the thread that is running on the other cpus matches the owner.
2435                          */
2436                         if (owner) {
2437                                 do {
2438                                         cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr;
2439                                         if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) {
2440                                                 owner_on_core = TRUE;
2441                                                 break;
2442                                         }
2443                                         if (++i >= real_ncpus) {
2444                                                 i = 0;
2445                                         }
2446                                 } while (i != prev_owner_cpu);
2447                                 enable_preemption();
2448
2449                                 if (owner_on_core) {
2450                                         prev_owner_cpu = i;
2451                                 } else {
2452                                         prev_owner = owner;
2453                                         state = ordered_load_mtx(lock);
2454                                         owner = LCK_MTX_STATE_TO_THREAD(state);
2455                                         if (owner == prev_owner) {
2456                                                 /*
2457                                                  * Owner is not on core.
2458                                                  * Stop spinning.
2459                                                  */
2460                                                 if (loopcount == 0) {
2461                                                         retval = SPINWAIT_DID_NOT_SPIN;
2462                                                 } else {
2463                                                         retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE;
2464                                                 }
2465                                                 break;
2466                                         }
2467                                         /*
2468                                          * Fall through if the owner changed while we were scanning.
2469                                          * The new owner could potentially be on core, so loop
2470                                          * again.
2471                                          */
2472                                 }
2473                         } else {
2474                                 enable_preemption();
2475                         }
2476                 }
2477
2478                 /*
2479                  * Save how many times we see the owner changing.
2480                  * We can roughly estimate the mutex hold
2481                  * time and the fairness with that.
2482                  */
2483                 if (owner != prev_owner) {
2484                         prev_owner = owner;
2485                         total_hold_time_samples++;
2486                         window_hold_time_samples++;
2487                 }
2488
2489                 /*
2490                  * Learning window expired.
2491                  * Try to adjust the sliding_deadline.
2492                  */
2493                 if (cur_time >= window_deadline) {
2494                         /*
2495                          * If there was no contention during the window
2496                          * stop spinning.
2497                          */
2498                         if (window_hold_time_samples < 1) {
2499                                 retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION;
2500                                 break;
2501                         }
2502
2503                         if (adjust) {
2504                                 /*
2505                                  * For a fair lock, we'd wait for at most (NCPU-1) periods,
2506                                  * but the lock is unfair, so let's try to estimate by how much.
2507                                  */
2508                                 unfairness = total_hold_time_samples / real_ncpus;
2509
2510                                 if (unfairness == 0) {
2511                                         /*
2512                                          * We observed the owner changing `total_hold_time_samples` times which
2513                                          * let us estimate the average hold time of this mutex for the duration
2514                                          * of the spin time.
2515                                          * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
2516                                          *
2517                                          * In this case spin at max avg_hold_time * (real_ncpus - 1)
2518                                          */
2519                                         delta = cur_time - start_time;
2520                                         sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
2521                                 } else {
2522                                         /*
2523                                          * In this case at least one of the other cpus was able to get the lock twice
2524                                          * while I was spinning.
2525                                          * We could spin longer but it won't necessarily help if the system is unfair.
2526                                          * Try to randomize the wait to reduce contention.
2527                                          *
2528                                          * We compute how much time we could potentially spin
2529                                          * and distribute it over the cpus.
2530                                          *
2531                                          * bias is an integer between 0 and real_ncpus.
2532                                          * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
2533                                          */
2534                                         delta = high_deadline - cur_time;
2535                                         sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
2536                                         adjust = FALSE;
2537                                 }
2538                         }
2539
2540                         window_deadline += low_MutexSpin;
2541                         window_hold_time_samples = 0;
2542                 }
2543
2544                 /*
2545                  * Stop spinning if we are past
2546                  * the adjusted deadline.
2547                  */
2548                 if (cur_time >= sliding_deadline) {
2549                         retval = SPINWAIT_DID_SPIN_SLIDING_THR;
2550                         break;
2551                 }
2552
2553                 /*
2554                  * We want to arm the monitor for wfe,
2555                  * so load exclusively the lock.
2556                  *
2557                  * NOTE:
2558                  * we rely on the fact that wfe will
2559                  * eventually return even if the cache line
2560                  * is not modified. This way we will keep
2561                  * looping and checking if the deadlines expired.
2562                  */
2563                 state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed);
2564                 owner = LCK_MTX_STATE_TO_THREAD(state);
2565                 if (owner != NULL) {
2566                         wait_for_event();
2567                         state = ordered_load_mtx(lock);
2568                 } else {
2569                         atomic_exchange_abort();
2570                 }
2571
2572                 loopcount++;
2573         } while (TRUE);
2574
2575 done_spinning:
2576 #if     CONFIG_DTRACE
2577         /*
2578          * Note that we record a different probe id depending on whether
2579          * this is a direct or indirect mutex.  This allows us to
2580          * penalize only lock groups that have debug/stats enabled
2581          * with dtrace processing if desired.
2582          */
2583         if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) {
2584                 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock,
2585                     mach_absolute_time() - start_time);
2586         } else {
2587                 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock,
2588                     mach_absolute_time() - start_time);
2589         }
2590         /* The lockstat acquire event is recorded by the caller. */
2591 #endif
2592
2593         state = ordered_load_mtx(lock);
2594
2595         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2596             trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0);
2597 #else /* __SMP__ */
2598         /* Spinwaiting is not useful on UP systems. */
2599 #pragma unused(lock, thread)
2600         int retval = SPINWAIT_DID_NOT_SPIN;
2601 #endif /* __SMP__ */
2602         if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) {
2603                 /* We must own either the lock or the interlock on return. */
2604                 interlock_lock(lock);
2605         }
2606
2607         return retval;
2608 }
2609
2610
2611 /*
2612  *      Common code for mutex locking as spinlock
2613  */
2614 static inline void
2615 lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2616 {
2617         uintptr_t       state;
2618
2619         interlock_lock(lock);
2620         state = ordered_load_mtx(lock);
2621         if (LCK_MTX_STATE_TO_THREAD(state)) {
2622                 if (allow_held_as_mutex) {
2623                         lck_mtx_lock_contended(lock, current_thread(), TRUE);
2624                 } else {
2625                         // "Always" variants can never block. If the lock is held and blocking is not allowed
2626                         // then someone is mixing always and non-always calls on the same lock, which is
2627                         // forbidden.
2628                         panic("Attempting to block on a lock taken as spin-always %p", lock);
2629                 }
2630                 return;
2631         }
2632         state &= ARM_LCK_WAITERS;                                               // Preserve waiters bit
2633         state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK);        // Add spin tag and maintain interlock
2634         ordered_store_mtx(lock, state);
2635         load_memory_barrier();
2636
2637 #if     CONFIG_DTRACE
2638         LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
2639 #endif /* CONFIG_DTRACE */
2640 }
2641
2642 /*
2643  *      Routine:        lck_mtx_lock_spin
2644  */
2645 void
2646 lck_mtx_lock_spin(lck_mtx_t *lock)
2647 {
2648         lck_mtx_check_preemption(lock);
2649         lck_mtx_lock_spin_internal(lock, TRUE);
2650 }
2651
2652 /*
2653  *      Routine:        lck_mtx_lock_spin_always
2654  */
2655 void
2656 lck_mtx_lock_spin_always(lck_mtx_t *lock)
2657 {
2658         lck_mtx_lock_spin_internal(lock, FALSE);
2659 }
2660
2661 /*
2662  *      Routine:        lck_mtx_try_lock
2663  */
2664 boolean_t
2665 lck_mtx_try_lock(lck_mtx_t *lock)
2666 {
2667         thread_t        thread = current_thread();
2668
2669         lck_mtx_verify(lock);
2670         if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2671             0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2672 #if     CONFIG_DTRACE
2673                 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
2674 #endif /* CONFIG_DTRACE */
2675                 return TRUE;
2676         }
2677         return lck_mtx_try_lock_contended(lock, thread);
2678 }
2679
2680 static boolean_t NOINLINE
2681 lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
2682 {
2683         thread_t        holding_thread;
2684         uintptr_t       state;
2685         int             waiters;
2686
2687 #if     __SMP__
2688         interlock_lock(lock);
2689         state = ordered_load_mtx(lock);
2690         holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2691         if (holding_thread) {
2692                 interlock_unlock(lock);
2693                 return FALSE;
2694         }
2695 #else
2696         disable_preemption_for_thread(thread);
2697         state = ordered_load_mtx(lock);
2698         if (state & LCK_ILOCK) {
2699                 panic("Unexpected interlock set (%p)", lock);
2700         }
2701         holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2702         if (holding_thread) {
2703                 enable_preemption();
2704                 return FALSE;
2705         }
2706         state |= LCK_ILOCK;
2707         ordered_store_mtx(lock, state);
2708 #endif  // __SMP__
2709         waiters = lck_mtx_lock_acquire(lock, NULL);
2710         state = LCK_MTX_THREAD_TO_STATE(thread);
2711         if (waiters != 0) {
2712                 state |= ARM_LCK_WAITERS;
2713         }
2714 #if __SMP__
2715         state |= LCK_ILOCK;                             // Preserve interlock
2716         ordered_store_mtx(lock, state); // Set ownership
2717         interlock_unlock(lock);                 // Release interlock, enable preemption
2718 #else
2719         ordered_store_mtx(lock, state); // Set ownership
2720         enable_preemption();
2721 #endif
2722         load_memory_barrier();
2723
2724         turnstile_cleanup();
2725
2726         return TRUE;
2727 }
2728
2729 static inline boolean_t
2730 lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2731 {
2732         uintptr_t       state;
2733
2734         if (!interlock_try(lock)) {
2735                 return FALSE;
2736         }
2737         state = ordered_load_mtx(lock);
2738         if (LCK_MTX_STATE_TO_THREAD(state)) {
2739                 // Lock is held as mutex
2740                 if (allow_held_as_mutex) {
2741                         interlock_unlock(lock);
2742                 } else {
2743                         // "Always" variants can never block. If the lock is held as a normal mutex
2744                         // then someone is mixing always and non-always calls on the same lock, which is
2745                         // forbidden.
2746                         panic("Spin-mutex held as full mutex %p", lock);
2747                 }
2748                 return FALSE;
2749         }
2750         state &= ARM_LCK_WAITERS;                                               // Preserve waiters bit
2751         state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK);        // Add spin tag and maintain interlock
2752         ordered_store_mtx(lock, state);
2753         load_memory_barrier();
2754
2755 #if     CONFIG_DTRACE
2756         LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
2757 #endif /* CONFIG_DTRACE */
2758         return TRUE;
2759 }
2760
2761 /*
2762  *      Routine: lck_mtx_try_lock_spin
2763  */
2764 boolean_t
2765 lck_mtx_try_lock_spin(lck_mtx_t *lock)
2766 {
2767         return lck_mtx_try_lock_spin_internal(lock, TRUE);
2768 }
2769
2770 /*
2771  *      Routine: lck_mtx_try_lock_spin_always
2772  */
2773 boolean_t
2774 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
2775 {
2776         return lck_mtx_try_lock_spin_internal(lock, FALSE);
2777 }
2778
2779
2780
2781 /*
2782  *      Routine:        lck_mtx_unlock
2783  */
2784 void
2785 lck_mtx_unlock(lck_mtx_t *lock)
2786 {
2787         thread_t        thread = current_thread();
2788         uintptr_t       state;
2789         boolean_t       ilk_held = FALSE;
2790
2791         lck_mtx_verify(lock);
2792
2793         state = ordered_load_mtx(lock);
2794         if (state & LCK_ILOCK) {
2795                 if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG) {
2796                         ilk_held = TRUE;        // Interlock is held by (presumably) this thread
2797                 }
2798                 goto slow_case;
2799         }
2800         // Locked as a mutex
2801         if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2802             LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
2803 #if     CONFIG_DTRACE
2804                 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2805 #endif /* CONFIG_DTRACE */
2806                 return;
2807         }
2808 slow_case:
2809         lck_mtx_unlock_contended(lock, thread, ilk_held);
2810 }
2811
2812 static void NOINLINE
2813 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
2814 {
2815         uintptr_t       state;
2816         boolean_t               cleanup = FALSE;
2817
2818         if (ilk_held) {
2819                 state = ordered_load_mtx(lock);
2820         } else {
2821 #if     __SMP__
2822                 interlock_lock(lock);
2823                 state = ordered_load_mtx(lock);
2824                 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
2825                         panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2826                 }
2827 #else
2828                 disable_preemption_for_thread(thread);
2829                 state = ordered_load_mtx(lock);
2830                 if (state & LCK_ILOCK) {
2831                         panic("lck_mtx_unlock(): Unexpected interlock set (%p)", lock);
2832                 }
2833                 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
2834                         panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2835                 }
2836                 state |= LCK_ILOCK;
2837                 ordered_store_mtx(lock, state);
2838 #endif
2839                 if (state & ARM_LCK_WAITERS) {
2840                         if (lck_mtx_unlock_wakeup(lock, thread)) {
2841                                 state = ARM_LCK_WAITERS;
2842                         } else {
2843                                 state = 0;
2844                         }
2845                         cleanup = TRUE;
2846                         goto unlock;
2847                 }
2848         }
2849         state &= ARM_LCK_WAITERS;   /* Clear state, retain waiters bit */
2850 unlock:
2851 #if __SMP__
2852         state |= LCK_ILOCK;
2853         ordered_store_mtx(lock, state);
2854         interlock_unlock(lock);
2855 #else
2856         ordered_store_mtx(lock, state);
2857         enable_preemption();
2858 #endif
2859         if (cleanup) {
2860                 /*
2861                  * Do not do any turnstile operations outside of this block.
2862                  * lock/unlock is called at early stage of boot with single thread,
2863                  * when turnstile is not yet initialized.
2864                  * Even without contention we can come throught the slow path
2865                  * if the mutex is acquired as a spin lock.
2866                  */
2867                 turnstile_cleanup();
2868         }
2869
2870 #if     CONFIG_DTRACE
2871         LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2872 #endif /* CONFIG_DTRACE */
2873 }
2874
2875 /*
2876  *      Routine:        lck_mtx_assert
2877  */
2878 void
2879 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
2880 {
2881         thread_t        thread, holder;
2882         uintptr_t       state;
2883
2884         state = ordered_load_mtx(lock);
2885         holder = LCK_MTX_STATE_TO_THREAD(state);
2886         if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
2887                 // Lock is held in spin mode, owner is unknown.
2888                 return; // Punt
2889         }
2890         thread = current_thread();
2891         if (type == LCK_MTX_ASSERT_OWNED) {
2892                 if (thread != holder) {
2893                         panic("lck_mtx_assert(): mutex (%p) owned", lock);
2894                 }
2895         } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
2896                 if (thread == holder) {
2897                         panic("lck_mtx_assert(): mutex (%p) not owned", lock);
2898                 }
2899         } else {
2900                 panic("lck_mtx_assert(): invalid arg (%u)", type);
2901         }
2902 }
2903
2904 /*
2905  *      Routine:        lck_mtx_ilk_unlock
2906  */
2907 boolean_t
2908 lck_mtx_ilk_unlock(lck_mtx_t *lock)
2909 {
2910         interlock_unlock(lock);
2911         return TRUE;
2912 }
2913
2914 /*
2915  *      Routine:        lck_mtx_convert_spin
2916  *
2917  *      Convert a mutex held for spin into a held full mutex
2918  */
2919 void
2920 lck_mtx_convert_spin(lck_mtx_t *lock)
2921 {
2922         thread_t        thread = current_thread();
2923         uintptr_t       state;
2924         int                     waiters;
2925
2926         state = ordered_load_mtx(lock);
2927         if (LCK_MTX_STATE_TO_THREAD(state) == thread) {
2928                 return;         // Already owned as mutex, return
2929         }
2930         if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG)) {
2931                 panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
2932         }
2933         state &= ~(LCK_MTX_THREAD_MASK);                // Clear the spin tag
2934         ordered_store_mtx(lock, state);
2935         waiters = lck_mtx_lock_acquire(lock, NULL);   // Acquire to manage priority boosts
2936         state = LCK_MTX_THREAD_TO_STATE(thread);
2937         if (waiters != 0) {
2938                 state |= ARM_LCK_WAITERS;
2939         }
2940 #if __SMP__
2941         state |= LCK_ILOCK;
2942         ordered_store_mtx(lock, state);                 // Set ownership
2943         interlock_unlock(lock);                                 // Release interlock, enable preemption
2944 #else
2945         ordered_store_mtx(lock, state);                 // Set ownership
2946         enable_preemption();
2947 #endif
2948         turnstile_cleanup();
2949 }
2950
2951
2952 /*
2953  *      Routine:        lck_mtx_destroy
2954  */
2955 void
2956 lck_mtx_destroy(
2957         lck_mtx_t * lck,
2958         lck_grp_t * grp)
2959 {
2960         if (lck->lck_mtx_type != LCK_MTX_TYPE) {
2961                 panic("Destroying invalid mutex %p", lck);
2962         }
2963         if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2964                 panic("Destroying previously destroyed lock %p", lck);
2965         }
2966         lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2967         lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
2968         lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2969         lck_grp_deallocate(grp);
2970         return;
2971 }
2972
2973 /*
2974  *      Routine:        lck_spin_assert
2975  */
2976 void
2977 lck_spin_assert(lck_spin_t *lock, unsigned int type)
2978 {
2979         thread_t        thread, holder;
2980         uintptr_t       state;
2981
2982         if (lock->type != LCK_SPIN_TYPE) {
2983                 panic("Invalid spinlock %p", lock);
2984         }
2985
2986         state = lock->lck_spin_data;
2987         holder = (thread_t)(state & ~LCK_ILOCK);
2988         thread = current_thread();
2989         if (type == LCK_ASSERT_OWNED) {
2990                 if (holder == 0) {
2991                         panic("Lock not owned %p = %lx", lock, state);
2992                 }
2993                 if (holder != thread) {
2994                         panic("Lock not owned by current thread %p = %lx", lock, state);
2995                 }
2996                 if ((state & LCK_ILOCK) == 0) {
2997                         panic("Lock bit not set %p = %lx", lock, state);
2998                 }
2999         } else if (type == LCK_ASSERT_NOTOWNED) {
3000                 if (holder != 0) {
3001                         if (holder == thread) {
3002                                 panic("Lock owned by current thread %p = %lx", lock, state);
3003                         }
3004                 }
3005         } else {
3006                 panic("lck_spin_assert(): invalid arg (%u)", type);
3007         }
3008 }
3009
3010 boolean_t
3011 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
3012 {
3013         lck_rw_word_t   word;
3014
3015         lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
3016
3017         word.data = ordered_load_rw(lck);
3018         if (word.want_excl || word.want_upgrade || force_yield) {
3019                 lck_rw_unlock_shared(lck);
3020                 mutex_pause(2);
3021                 lck_rw_lock_shared(lck);
3022                 return TRUE;
3023         }
3024
3025         return FALSE;
3026 }
3027
3028 /*
3029  * Routine: kdp_lck_mtx_lock_spin_is_acquired
3030  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3031  */
3032 boolean_t
3033 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3034 {
3035         uintptr_t       state;
3036
3037         if (not_in_kdp) {
3038                 panic("panic: spinlock acquired check done outside of kernel debugger");
3039         }
3040         state = ordered_load_mtx(lck);
3041         if (state == LCK_MTX_TAG_DESTROYED) {
3042                 return FALSE;
3043         }
3044         if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK)) {
3045                 return TRUE;
3046         }
3047         return FALSE;
3048 }
3049
3050 void
3051 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3052 {
3053         lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3054         waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3055         uintptr_t state   = ordered_load_mtx(mutex);
3056         thread_t holder   = LCK_MTX_STATE_TO_THREAD(state);
3057         if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
3058                 waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
3059         } else {
3060                 assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
3061                 assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
3062                 waitinfo->owner = thread_tid(holder);
3063         }
3064 }
3065
3066 void
3067 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3068 {
3069         lck_rw_t        *rwlck = NULL;
3070         switch (waitinfo->wait_type) {
3071         case kThreadWaitKernelRWLockRead:
3072                 rwlck = READ_EVENT_TO_RWLOCK(event);
3073                 break;
3074         case kThreadWaitKernelRWLockWrite:
3075         case kThreadWaitKernelRWLockUpgrade:
3076                 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3077                 break;
3078         default:
3079                 panic("%s was called with an invalid blocking type", __FUNCTION__);
3080                 break;
3081         }
3082         waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3083         waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
3084 }