1 /*
2 * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System Copyright (c) 1991,1990,1989,1988,1987 Carnegie
33 * Mellon University All Rights Reserved.
34 *
35 * Permission to use, copy, modify and distribute this software and its
36 * documentation is hereby granted, provided that both the copyright notice
37 * and this permission notice appear in all copies of the software,
38 * derivative works or modified versions, and any portions thereof, and that
39 * both notices appear in supporting documentation.
40 *
41 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
42 * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
43 * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
44 *
45 * Carnegie Mellon requests users of this software to return to
46 *
47 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
48 * School of Computer Science Carnegie Mellon University Pittsburgh PA
49 * 15213-3890
50 *
51 * any improvements or extensions that they make and grant Carnegie Mellon the
52 * rights to redistribute these changes.
53 */
54 /*
55 * File: kern/lock.c
56 * Author: Avadis Tevanian, Jr., Michael Wayne Young
57 * Date: 1985
58 *
59 * Locking primitives implementation
60 */
61
62 #define LOCK_PRIVATE 1
63
64 #include <mach_ldebug.h>
65
66 #include <kern/kalloc.h>
67 #include <kern/lock_stat.h>
68 #include <kern/locks.h>
69 #include <kern/misc_protos.h>
70 #include <kern/thread.h>
71 #include <kern/processor.h>
72 #include <kern/sched_prim.h>
73 #include <kern/debug.h>
74 #include <kern/kcdata.h>
75 #include <string.h>
76
77 #include <arm/cpu_data_internal.h>
78 #include <arm/proc_reg.h>
79 #include <arm/smp.h>
80 #include <machine/atomic.h>
81 #include <machine/machine_cpu.h>
82
83 #include <sys/kdebug.h>
84
85 #if CONFIG_DTRACE
86 #define DTRACE_RW_SHARED 0x0 //reader
87 #define DTRACE_RW_EXCL 0x1 //writer
88 #define DTRACE_NO_FLAG 0x0 //not applicable
89 #endif /* CONFIG_DTRACE */
90
91 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
92 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
93 #define LCK_RW_LCK_SHARED_CODE 0x102
94 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
95 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
96 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
97
98
99 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
100
101 // Panic in tests that check lock usage correctness.
102 // Such panics are undesirable when already panicking or while a debugger is running.
103 #define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)
104
105 unsigned int LcksOpts = 0;
106
107 #define ADAPTIVE_SPIN_ENABLE 0x1
108
109 #if __SMP__
110 int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE;
111 #else /* __SMP__ */
112 int lck_mtx_adaptive_spin_mode = 0;
113 #endif /* __SMP__ */
114
115 #define SPINWAIT_OWNER_CHECK_COUNT 4
116
117 typedef enum {
118 SPINWAIT_ACQUIRED, /* Got the lock. */
119 SPINWAIT_INTERLOCK, /* Got the interlock, no owner, but caller must finish acquiring the lock. */
120 SPINWAIT_DID_SPIN, /* Got the interlock, spun, but failed to get the lock. */
121 SPINWAIT_DID_NOT_SPIN, /* Got the interlock, did not spin. */
122 } spinwait_result_t;
123
124 #if CONFIG_DTRACE && __SMP__
125 extern uint64_t dtrace_spin_threshold;
126 #endif
127
128 /* Forwards */
129
130 extern unsigned int not_in_kdp;
131
132 /*
133 * We often want to know the addresses of the callers
134 * of the various lock routines. However, this information
135 * is only used for debugging and statistics.
136 */
137 typedef void *pc_t;
138 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
139 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
140
141 #ifdef lint
142 /*
143 * Eliminate lint complaints about unused local pc variables.
144 */
145 #define OBTAIN_PC(pc, l) ++pc
146 #else /* lint */
147 #define OBTAIN_PC(pc, l)
148 #endif /* lint */
149
150
151 /*
152 * Portable lock package implementation of usimple_locks.
153 */
154
155 /*
156 * Owner thread pointer when lock held in spin mode
157 */
158 #define LCK_MTX_SPIN_TAG 0xfffffff0
159
160
161 #define interlock_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
162 #define interlock_try(lock) hw_lock_bit_try((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
163 #define interlock_unlock(lock) hw_unlock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
164 #define lck_rw_ilk_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
165 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
166
167 #define load_memory_barrier() os_atomic_thread_fence(acquire)
168
169 // Enforce program order of loads and stores.
170 #define ordered_load(target) \
171 os_atomic_load(target, compiler_acq_rel)
172 #define ordered_store(target, value) \
173 os_atomic_store(target, value, compiler_acq_rel)
174
175 #define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data)
176 #define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value))
177 #define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data)
178 #define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, (value))
179 #define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner)
180 #define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, (value))
181 #define ordered_load_hw(lock) ordered_load(&(lock)->lock_data)
182 #define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, (value))
183 #define ordered_load_bit(lock) ordered_load((lock))
184 #define ordered_store_bit(lock, value) ordered_store((lock), (value))
185
186
187 // Prevent the compiler from reordering memory operations around this
188 #define compiler_memory_fence() __asm__ volatile ("" ::: "memory")
189
190 #define LOCK_PANIC_TIMEOUT 0xc00000
191 #define NOINLINE __attribute__((noinline))
192
193
194 #if __arm__
195 #define interrupts_disabled(mask) (mask & PSR_INTMASK)
196 #else
197 #define interrupts_disabled(mask) (mask & DAIF_IRQF)
198 #endif
199
200
201 #if __arm__
202 #define enable_fiq() __asm__ volatile ("cpsie f" ::: "memory");
203 #define enable_interrupts() __asm__ volatile ("cpsie if" ::: "memory");
204 #endif
205
206 /*
207 * Forward declarations
208 */
209
210 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
211 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
212 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
213 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
214 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
215 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
216 static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
217
218 /*
219 * atomic exchange API is a low level abstraction of the operations
220 * to atomically read, modify, and write a pointer. This abstraction works
221 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
222 * well as the ARM exclusive instructions.
223 *
224 * atomic_exchange_begin() - begin exchange and retrieve current value
225 * atomic_exchange_complete() - conclude an exchange
226 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
227 */
228 __unused static uint32_t
229 load_exclusive32(uint32_t *target, enum memory_order ord)
230 {
231 uint32_t value;
232
233 #if __arm__
234 if (memory_order_has_release(ord)) {
235 // Pre-load release barrier
236 atomic_thread_fence(memory_order_release);
237 }
238 value = __builtin_arm_ldrex(target);
239 #else
240 if (memory_order_has_acquire(ord)) {
241 value = __builtin_arm_ldaex(target); // ldaxr
242 } else {
243 value = __builtin_arm_ldrex(target); // ldxr
244 }
245 #endif // __arm__
246 return value;
247 }
248
249 __unused static boolean_t
250 store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
251 {
252 boolean_t err;
253
254 #if __arm__
255 err = __builtin_arm_strex(value, target);
256 if (memory_order_has_acquire(ord)) {
257 // Post-store acquire barrier
258 atomic_thread_fence(memory_order_acquire);
259 }
260 #else
261 if (memory_order_has_release(ord)) {
262 err = __builtin_arm_stlex(value, target); // stlxr
263 } else {
264 err = __builtin_arm_strex(value, target); // stxr
265 }
266 #endif // __arm__
267 return !err;
268 }
269
270 static uint32_t
271 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
272 {
273 uint32_t val;
274
275 #if __ARM_ATOMICS_8_1
276 ord = memory_order_relaxed;
277 #endif
278 val = load_exclusive32(target, ord);
279 *previous = val;
280 return val;
281 }
282
283 static boolean_t
284 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
285 {
286 #if __ARM_ATOMICS_8_1
287 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
288 #else
289 (void)previous; // Previous not needed, monitor is held
290 return store_exclusive32(target, newval, ord);
291 #endif
292 }
293
294 static void
295 atomic_exchange_abort(void)
296 {
297 os_atomic_clear_exclusive();
298 }
299
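/*
 * Routine: atomic_test_and_set32
 * Function:
 * Atomically set the bits in set_mask in *target, provided none of the
 * bits in test_mask are currently set. Returns TRUE on success. On
 * failure it returns FALSE, after either waiting for a change
 * (wait == TRUE, WFE with the monitor held) or clearing the exclusive monitor.
 */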
300 static boolean_t
301 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
302 {
303 uint32_t value, prev;
304
305 for (;;) {
306 value = atomic_exchange_begin32(target, &prev, ord);
307 if (value & test_mask) {
308 if (wait) {
309 wait_for_event(); // Wait with monitor held
310 } else {
311 atomic_exchange_abort(); // Clear exclusive monitor
312 }
313 return FALSE;
314 }
315 value |= set_mask;
316 if (atomic_exchange_complete32(target, prev, value, ord)) {
317 return TRUE;
318 }
319 }
320 }
321
322 inline boolean_t
323 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
324 {
325 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
326 }
327
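/*
 * Routine: _disable_preemption
 * Function:
 * Increment the current thread's preemption disable count,
 * panicking if the count overflows.
 */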
328 void
329 _disable_preemption(void)
330 {
331 thread_t thread = current_thread();
332 unsigned int count = thread->machine.preemption_count;
333
334 count += 1;
335 if (__improbable(count == 0)) {
336 panic("Preemption count overflow");
337 }
338
339 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
340 }
341
342 /*
343 * This function checks whether an AST_URGENT has been pended.
344 *
345 * It is called once preemption has been reenabled, which means the thread
346 * may have been preempted right before this was called, and by the time this
347 * function actually performs the check, we may have changed CPU.
348 *
349 * This race is however benign: the point of AST_URGENT is to trigger a context
350 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
351 * was cleared in the process.
352 *
353 * It follows that this check cannot have false negatives, which allows us
354 * to avoid fiddling with interrupt state for the vast majority of cases
355 * when the check will actually be negative.
356 */
357 static NOINLINE void
358 kernel_preempt_check(thread_t thread)
359 {
360 cpu_data_t *cpu_data_ptr;
361 long state;
362
363 #if __arm__
364 #define INTERRUPT_MASK PSR_IRQF
365 #else // __arm__
366 #define INTERRUPT_MASK DAIF_IRQF
367 #endif // __arm__
368
369 /*
370 * This check is racy and could load from another CPU's pending_ast mask,
371 * but as described above, this can't have false negatives.
372 */
373 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
374 if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) {
375 return;
376 }
377
378 /* If interrupts are masked, we can't take an AST here */
379 state = get_interrupts();
380 if ((state & INTERRUPT_MASK) == 0) {
381 disable_interrupts_noread(); // Disable interrupts
382
383 /*
384 * Reload cpu_data_ptr: a context switch would cause it to change.
385 * Now that interrupts are disabled, this will debounce false positives.
386 */
387 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
388 if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
389 #if __arm__
390 #if __ARM_USER_PROTECT__
391 uintptr_t up = arm_user_protect_begin(thread);
392 #endif // __ARM_USER_PROTECT__
393 enable_fiq();
394 #endif // __arm__
395 ast_taken_kernel(); // Handle urgent AST
396 #if __arm__
397 #if __ARM_USER_PROTECT__
398 arm_user_protect_end(thread, up, TRUE);
399 #endif // __ARM_USER_PROTECT__
400 enable_interrupts();
401 return; // Return early on arm only due to FIQ enabling
402 #endif // __arm__
403 }
404 restore_interrupts(state); // Enable interrupts
405 }
406 }
407
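/*
 * Routine: _enable_preemption
 * Function:
 * Decrement the current thread's preemption disable count, panicking on
 * underflow, and check for a pending urgent AST once the count drops to zero.
 */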
408 void
409 _enable_preemption(void)
410 {
411 thread_t thread = current_thread();
412 unsigned int count = thread->machine.preemption_count;
413
414 if (__improbable(count == 0)) {
415 panic("Preemption count underflow");
416 }
417 count -= 1;
418
419 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
420 if (count == 0) {
421 kernel_preempt_check(thread);
422 }
423 }
424
425 int
426 get_preemption_level(void)
427 {
428 return current_thread()->machine.preemption_count;
429 }
430
431 #if __SMP__
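/*
 * Interlock helpers that also manage interrupt state:
 * interlock_try_disable_interrupts() disables interrupts before trying the
 * mutex interlock and restores them if the interlock cannot be taken;
 * interlock_unlock_enable_interrupts() drops the interlock and restores the
 * saved interrupt state.
 */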
432 static inline boolean_t
433 interlock_try_disable_interrupts(
434 lck_mtx_t *mutex,
435 boolean_t *istate)
436 {
437 *istate = ml_set_interrupts_enabled(FALSE);
438
439 if (interlock_try(mutex)) {
440 return 1;
441 } else {
442 ml_set_interrupts_enabled(*istate);
443 return 0;
444 }
445 }
446
447 static inline void
448 interlock_unlock_enable_interrupts(
449 lck_mtx_t *mutex,
450 boolean_t istate)
451 {
452 interlock_unlock(mutex);
453 ml_set_interrupts_enabled(istate);
454 }
455 #endif /* __SMP__ */
456
457 /*
458 * Routine: lck_spin_alloc_init
459 */
460 lck_spin_t *
461 lck_spin_alloc_init(
462 lck_grp_t * grp,
463 lck_attr_t * attr)
464 {
465 lck_spin_t *lck;
466
467 if ((lck = (lck_spin_t *) kalloc(sizeof(lck_spin_t))) != 0) {
468 lck_spin_init(lck, grp, attr);
469 }
470
471 return lck;
472 }
473
474 /*
475 * Routine: lck_spin_free
476 */
477 void
478 lck_spin_free(
479 lck_spin_t * lck,
480 lck_grp_t * grp)
481 {
482 lck_spin_destroy(lck, grp);
483 kfree(lck, sizeof(lck_spin_t));
484 }
485
486 /*
487 * Routine: lck_spin_init
488 */
489 void
490 lck_spin_init(
491 lck_spin_t * lck,
492 lck_grp_t * grp,
493 __unused lck_attr_t * attr)
494 {
495 lck->type = LCK_SPIN_TYPE;
496 hw_lock_init(&lck->hwlock);
497 if (grp) {
498 lck_grp_reference(grp);
499 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
500 }
501 }
502
503 /*
504 * arm_usimple_lock is a lck_spin_t without a group or attributes
505 */
506 void inline
507 arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
508 {
509 lck->type = LCK_SPIN_TYPE;
510 hw_lock_init(&lck->hwlock);
511 }
512
513
514 /*
515 * Routine: lck_spin_lock
516 */
517 void
518 lck_spin_lock(lck_spin_t *lock)
519 {
520 #if DEVELOPMENT || DEBUG
521 if (lock->type != LCK_SPIN_TYPE) {
522 panic("Invalid spinlock %p", lock);
523 }
524 #endif // DEVELOPMENT || DEBUG
525 hw_lock_lock(&lock->hwlock, LCK_GRP_NULL);
526 }
527
528 void
529 lck_spin_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
530 {
531 #pragma unused(grp)
532 #if DEVELOPMENT || DEBUG
533 if (lock->type != LCK_SPIN_TYPE) {
534 panic("Invalid spinlock %p", lock);
535 }
536 #endif // DEVELOPMENT || DEBUG
537 hw_lock_lock(&lock->hwlock, grp);
538 }
539
540 /*
541 * Routine: lck_spin_lock_nopreempt
542 */
543 void
544 lck_spin_lock_nopreempt(lck_spin_t *lock)
545 {
546 #if DEVELOPMENT || DEBUG
547 if (lock->type != LCK_SPIN_TYPE) {
548 panic("Invalid spinlock %p", lock);
549 }
550 #endif // DEVELOPMENT || DEBUG
551 hw_lock_lock_nopreempt(&lock->hwlock, LCK_GRP_NULL);
552 }
553
554 void
555 lck_spin_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
556 {
557 #pragma unused(grp)
558 #if DEVELOPMENT || DEBUG
559 if (lock->type != LCK_SPIN_TYPE) {
560 panic("Invalid spinlock %p", lock);
561 }
562 #endif // DEVELOPMENT || DEBUG
563 hw_lock_lock_nopreempt(&lock->hwlock, grp);
564 }
565
566 /*
567 * Routine: lck_spin_try_lock
568 */
569 int
570 lck_spin_try_lock(lck_spin_t *lock)
571 {
572 return hw_lock_try(&lock->hwlock, LCK_GRP_NULL);
573 }
574
575 int
576 lck_spin_try_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
577 {
578 #pragma unused(grp)
579 return hw_lock_try(&lock->hwlock, grp);
580 }
581
582 /*
583 * Routine: lck_spin_try_lock_nopreempt
584 */
585 int
586 lck_spin_try_lock_nopreempt(lck_spin_t *lock)
587 {
588 return hw_lock_try_nopreempt(&lock->hwlock, LCK_GRP_NULL);
589 }
590
591 int
592 lck_spin_try_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
593 {
594 #pragma unused(grp)
595 return hw_lock_try_nopreempt(&lock->hwlock, grp);
596 }
597
598 /*
599 * Routine: lck_spin_unlock
600 */
601 void
602 lck_spin_unlock(lck_spin_t *lock)
603 {
604 #if DEVELOPMENT || DEBUG
605 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
606 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
607 }
608 if (lock->type != LCK_SPIN_TYPE) {
609 panic("Invalid spinlock type %p", lock);
610 }
611 #endif // DEVELOPMENT || DEBUG
612 hw_lock_unlock(&lock->hwlock);
613 }
614
615 /*
616 * Routine: lck_spin_unlock_nopreempt
617 */
618 void
619 lck_spin_unlock_nopreempt(lck_spin_t *lock)
620 {
621 #if DEVELOPMENT || DEBUG
622 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
623 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
624 }
625 if (lock->type != LCK_SPIN_TYPE) {
626 panic("Invalid spinlock type %p", lock);
627 }
628 #endif // DEVELOPMENT || DEBUG
629 hw_lock_unlock_nopreempt(&lock->hwlock);
630 }
631
632 /*
633 * Routine: lck_spin_destroy
634 */
635 void
636 lck_spin_destroy(
637 lck_spin_t * lck,
638 lck_grp_t * grp)
639 {
640 if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED) {
641 return;
642 }
643 lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
644 if (grp) {
645 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
646 lck_grp_deallocate(grp);
647 }
648 }
649
650 /*
651 * Routine: kdp_lck_spin_is_acquired
652 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
653 */
654 boolean_t
655 kdp_lck_spin_is_acquired(lck_spin_t *lck)
656 {
657 if (not_in_kdp) {
658 panic("panic: spinlock acquired check done outside of kernel debugger");
659 }
660 return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE:FALSE;
661 }
662
663 /*
664 * Initialize a usimple_lock.
665 *
666 * No change in preemption state.
667 */
668 void
669 usimple_lock_init(
670 usimple_lock_t l,
671 unsigned short tag)
672 {
673 simple_lock_init((simple_lock_t) l, tag);
674 }
675
676
677 /*
678 * Acquire a usimple_lock.
679 *
680 * Returns with preemption disabled. Note
681 * that the hw_lock routines are responsible for
682 * maintaining preemption state.
683 */
684 void
685 (usimple_lock)(
686 usimple_lock_t l
687 LCK_GRP_ARG(lck_grp_t *grp))
688 {
689 simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
690 }
691
692
693 extern void sync(void);
694
695 /*
696 * Release a usimple_lock.
697 *
698 * Returns with preemption enabled. Note
699 * that the hw_lock routines are responsible for
700 * maintaining preemption state.
701 */
702 void
703 (usimple_unlock)(
704 usimple_lock_t l)
705 {
706 simple_unlock((simple_lock_t)l);
707 }
708
709
710 /*
711 * Conditionally acquire a usimple_lock.
712 *
713 * On success, returns with preemption disabled.
714 * On failure, returns with preemption in the same state
715 * as when first invoked. Note that the hw_lock routines
716 * are responsible for maintaining preemption state.
717 *
718 * XXX No stats are gathered on a miss; I preserved this
719 * behavior from the original assembly-language code, but
720 * doesn't it make sense to log misses? XXX
721 */
722 unsigned
723 int
724 (usimple_lock_try)(
725 usimple_lock_t l
726 LCK_GRP_ARG(lck_grp_t *grp))
727 {
728 return simple_lock_try((simple_lock_t) l, grp);
729 }
730
731 /*
732 * The C portion of the shared/exclusive locks package.
733 */
734
735 /*
736 * compute the deadline to spin against when
737 * waiting for a change of state on a lck_rw_t
738 */
739 #if __SMP__
740 static inline uint64_t
741 lck_rw_deadline_for_spin(lck_rw_t *lck)
742 {
743 lck_rw_word_t word;
744
745 word.data = ordered_load_rw(lck);
746 if (word.can_sleep) {
747 if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
748 /*
749 * there are already threads waiting on this lock... this
750 * implies that they have spun beyond their deadlines waiting for
751 * the desired state to show up so we will not bother spinning at this time...
752 * or
753 * the current number of threads sharing this lock exceeds our capacity to run them
754 * concurrently and since all states we're going to spin for require the rw_shared_count
755 * to be at 0, we'll not bother spinning since the latency for this to happen is
756 * unpredictable...
757 */
758 return mach_absolute_time();
759 }
760 return mach_absolute_time() + MutexSpin;
761 } else {
762 return mach_absolute_time() + (100000LL * 1000000000LL);
763 }
764 }
765 #endif // __SMP__
766
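/*
 * Routine: lck_rw_drain_status
 * Function:
 * Wait for the bits in status_mask to clear in the lock word.
 * If wait is TRUE, spin (WFE) until the deadline computed by
 * lck_rw_deadline_for_spin(); otherwise sample the state once.
 * Returns TRUE once the masked bits are clear, FALSE on timeout.
 */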
767 static boolean_t
768 lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unused)
769 {
770 #if __SMP__
771 uint64_t deadline = 0;
772 uint32_t data;
773
774 if (wait) {
775 deadline = lck_rw_deadline_for_spin(lock);
776 }
777
778 for (;;) {
779 data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
780 if ((data & status_mask) == 0) {
781 break;
782 }
783 if (wait) {
784 wait_for_event();
785 } else {
786 os_atomic_clear_exclusive();
787 }
788 if (!wait || (mach_absolute_time() >= deadline)) {
789 return FALSE;
790 }
791 }
792 os_atomic_clear_exclusive();
793 return TRUE;
794 #else
795 uint32_t data;
796
797 data = ordered_load_rw(lock);
798 if ((data & status_mask) == 0) {
799 return TRUE;
800 } else {
801 return FALSE;
802 }
803 #endif // __SMP__
804 }
805
806 /*
807 * Spin while interlock is held.
808 */
809 static inline void
810 lck_rw_interlock_spin(lck_rw_t *lock)
811 {
812 #if __SMP__
813 uint32_t data;
814
815 for (;;) {
816 data = load_exclusive32(&lock->lck_rw_data, memory_order_relaxed);
817 if (data & LCK_RW_INTERLOCK) {
818 wait_for_event();
819 } else {
820 os_atomic_clear_exclusive();
821 return;
822 }
823 }
824 #else
825 panic("lck_rw_interlock_spin(): Interlock locked %p %x", lock, lock->lck_rw_data);
826 #endif
827 }
828
829 /*
830 * We disable interrupts while holding the RW interlock to prevent an
831 * interrupt from exacerbating hold time.
832 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
833 */
834 static inline boolean_t
835 lck_interlock_lock(lck_rw_t *lck)
836 {
837 boolean_t istate;
838
839 istate = ml_set_interrupts_enabled(FALSE);
840 lck_rw_ilk_lock(lck);
841 return istate;
842 }
843
844 static inline void
845 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
846 {
847 lck_rw_ilk_unlock(lck);
848 ml_set_interrupts_enabled(istate);
849 }
850
851
852 #define LCK_RW_GRAB_WANT 0
853 #define LCK_RW_GRAB_SHARED 1
854
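/*
 * Routine: lck_rw_grab
 * Function:
 * Attempt to update the lock word for the requested mode:
 * LCK_RW_GRAB_WANT sets LCK_RW_WANT_EXCL, LCK_RW_GRAB_SHARED adds a
 * reader reference when the state permits it. If wait is TRUE, keep
 * retrying (WFE) until the deadline from lck_rw_deadline_for_spin()
 * expires. Returns TRUE if the lock word was updated, FALSE otherwise.
 */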
855 static boolean_t
856 lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait)
857 {
858 uint64_t deadline = 0;
859 uint32_t data, prev;
860 boolean_t do_exch;
861
862 #if __SMP__
863 if (wait) {
864 deadline = lck_rw_deadline_for_spin(lock);
865 }
866 #else
867 wait = FALSE; // Don't spin on UP systems
868 #endif
869
870 for (;;) {
871 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
872 if (data & LCK_RW_INTERLOCK) {
873 atomic_exchange_abort();
874 lck_rw_interlock_spin(lock);
875 continue;
876 }
877 do_exch = FALSE;
878 if (mode == LCK_RW_GRAB_WANT) {
879 if ((data & LCK_RW_WANT_EXCL) == 0) {
880 data |= LCK_RW_WANT_EXCL;
881 do_exch = TRUE;
882 }
883 } else { // LCK_RW_GRAB_SHARED
884 if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
885 (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
886 data += LCK_RW_SHARED_READER;
887 do_exch = TRUE;
888 }
889 }
890 if (do_exch) {
891 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
892 return TRUE;
893 }
894 } else {
895 if (wait) { // Wait with monitor held
896 wait_for_event();
897 } else {
898 atomic_exchange_abort();
899 }
900 if (!wait || (mach_absolute_time() >= deadline)) {
901 return FALSE;
902 }
903 }
904 }
905 }
906
907
908 /*
909 * Routine: lck_rw_alloc_init
910 */
911 lck_rw_t *
912 lck_rw_alloc_init(
913 lck_grp_t *grp,
914 lck_attr_t *attr)
915 {
916 lck_rw_t *lck;
917
918 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
919 lck_rw_init(lck, grp, attr);
920 }
921
922 return lck;
923 }
924
925 /*
926 * Routine: lck_rw_free
927 */
928 void
929 lck_rw_free(
930 lck_rw_t *lck,
931 lck_grp_t *grp)
932 {
933 lck_rw_destroy(lck, grp);
934 kfree(lck, sizeof(lck_rw_t));
935 }
936
937 /*
938 * Routine: lck_rw_init
939 */
940 void
941 lck_rw_init(
942 lck_rw_t *lck,
943 lck_grp_t *grp,
944 lck_attr_t *attr)
945 {
946 if (attr == LCK_ATTR_NULL) {
947 attr = &LockDefaultLckAttr;
948 }
949 memset(lck, 0, sizeof(lck_rw_t));
950 lck->lck_rw_can_sleep = TRUE;
951 if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0) {
952 lck->lck_rw_priv_excl = TRUE;
953 }
954
955 lck_grp_reference(grp);
956 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
957 }
958
959
960 /*
961 * Routine: lck_rw_destroy
962 */
963 void
964 lck_rw_destroy(
965 lck_rw_t *lck,
966 lck_grp_t *grp)
967 {
968 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
969 return;
970 }
971 #if MACH_LDEBUG
972 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
973 #endif
974 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
975 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
976 lck_grp_deallocate(grp);
977 return;
978 }
979
980 /*
981 * Routine: lck_rw_lock
982 */
983 void
984 lck_rw_lock(
985 lck_rw_t *lck,
986 lck_rw_type_t lck_rw_type)
987 {
988 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
989 lck_rw_lock_shared(lck);
990 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
991 lck_rw_lock_exclusive(lck);
992 } else {
993 panic("lck_rw_lock(): Invalid RW lock type: %x", lck_rw_type);
994 }
995 }
996
997 /*
998 * Routine: lck_rw_lock_exclusive
999 */
1000 void
1001 lck_rw_lock_exclusive(lck_rw_t *lock)
1002 {
1003 thread_t thread = current_thread();
1004
1005 thread->rwlock_count++;
1006 if (atomic_test_and_set32(&lock->lck_rw_data,
1007 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1008 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1009 #if CONFIG_DTRACE
1010 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1011 #endif /* CONFIG_DTRACE */
1012 } else {
1013 lck_rw_lock_exclusive_gen(lock);
1014 }
1015 #if MACH_ASSERT
1016 thread_t owner = ordered_load_rw_owner(lock);
1017 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1018 #endif
1019 ordered_store_rw_owner(lock, thread);
1020 }
1021
1022 /*
1023 * Routine: lck_rw_lock_shared
1024 */
1025 void
1026 lck_rw_lock_shared(lck_rw_t *lock)
1027 {
1028 uint32_t data, prev;
1029
1030 current_thread()->rwlock_count++;
1031 for (;;) {
1032 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1033 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1034 atomic_exchange_abort();
1035 lck_rw_lock_shared_gen(lock);
1036 break;
1037 }
1038 data += LCK_RW_SHARED_READER;
1039 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1040 break;
1041 }
1042 cpu_pause();
1043 }
1044 #if MACH_ASSERT
1045 thread_t owner = ordered_load_rw_owner(lock);
1046 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1047 #endif
1048 #if CONFIG_DTRACE
1049 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1050 #endif /* CONFIG_DTRACE */
1051 return;
1052 }
1053
1054 /*
1055 * Routine: lck_rw_lock_shared_to_exclusive
1056 *
1057 * False returned upon failure, in this case the shared lock is dropped.
1058 */
1059 boolean_t
1060 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1061 {
1062 uint32_t data, prev;
1063
1064 for (;;) {
1065 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1066 if (data & LCK_RW_INTERLOCK) {
1067 atomic_exchange_abort();
1068 lck_rw_interlock_spin(lock);
1069 continue;
1070 }
1071 if (data & LCK_RW_WANT_UPGRADE) {
1072 data -= LCK_RW_SHARED_READER;
1073 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1074 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1075 }
1076 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1077 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1078 }
1079 } else {
1080 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1081 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1082 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1083 break;
1084 }
1085 }
1086 cpu_pause();
1087 }
1088 /* we now own the WANT_UPGRADE */
1089 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1090 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1091 }
1092 #if MACH_ASSERT
1093 thread_t owner = ordered_load_rw_owner(lock);
1094 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1095 #endif
1096 ordered_store_rw_owner(lock, current_thread());
1097 #if CONFIG_DTRACE
1098 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1099 #endif /* CONFIG_DTRACE */
1100 return TRUE;
1101 }
1102
1103
1104 /*
1105 * Routine: lck_rw_lock_shared_to_exclusive_failure
1106 * Function:
1107 * Fast path code has already dropped our read
1108 * count and determined that someone else owns 'lck_rw_want_upgrade'.
1109 * If 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting';
1110 * all we need to do here is determine if a wakeup is needed.
1111 */
1112 static boolean_t
1113 lck_rw_lock_shared_to_exclusive_failure(
1114 lck_rw_t *lck,
1115 uint32_t prior_lock_state)
1116 {
1117 thread_t thread = current_thread();
1118 uint32_t rwlock_count;
1119
1120 /* Check if dropping the lock means that we need to unpromote */
1121 rwlock_count = thread->rwlock_count--;
1122 #if MACH_LDEBUG
1123 if (rwlock_count == 0) {
1124 panic("rw lock count underflow for thread %p", thread);
1125 }
1126 #endif
1127 if ((prior_lock_state & LCK_RW_W_WAITING) &&
1128 ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1129 /*
1130 * Someone else has requested upgrade.
1131 * Since we've released the read lock, wake
1132 * him up if he's blocked waiting
1133 */
1134 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1135 }
1136
1137 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1138 /* sched_flags checked without lock, but will be rechecked while clearing */
1139 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1140 }
1141
1142 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1143 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1144
1145 return FALSE;
1146 }
1147
1148 /*
1149 * Routine: lck_rw_lock_shared_to_exclusive_success
1150 * Function:
1151 * Fast path code has already dropped our read
1152 * count and successfully acquired 'lck_rw_want_upgrade';
1153 * we just need to wait for the rest of the readers to drain
1154 * and then we can return as the exclusive holder of this lock
1155 */
1156 static boolean_t
1157 lck_rw_lock_shared_to_exclusive_success(
1158 lck_rw_t *lock)
1159 {
1160 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1161 int slept = 0;
1162 lck_rw_word_t word;
1163 wait_result_t res;
1164 boolean_t istate;
1165 boolean_t not_shared;
1166
1167 #if CONFIG_DTRACE
1168 uint64_t wait_interval = 0;
1169 int readers_at_sleep = 0;
1170 boolean_t dtrace_ls_initialized = FALSE;
1171 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1172 #endif
1173
1174 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
1175 word.data = ordered_load_rw(lock);
1176 #if CONFIG_DTRACE
1177 if (dtrace_ls_initialized == FALSE) {
1178 dtrace_ls_initialized = TRUE;
1179 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1180 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1181 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1182 if (dtrace_ls_enabled) {
1183 /*
1184 * Either sleeping or spinning is happening,
1185 * start a timing of our delay interval now.
1186 */
1187 readers_at_sleep = word.shared_count;
1188 wait_interval = mach_absolute_time();
1189 }
1190 }
1191 #endif
1192
1193 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1194 trace_lck, word.shared_count, 0, 0, 0);
1195
1196 not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);
1197
1198 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1199 trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1200
1201 if (not_shared) {
1202 break;
1203 }
1204
1205 /*
1206 * if we get here, the spin deadline in lck_rw_drain_status()
1207 * has expired w/o the rw_shared_count having drained to 0;
1208 * check to see if we're allowed to do a thread_block
1209 */
1210 if (word.can_sleep) {
1211 istate = lck_interlock_lock(lock);
1212
1213 word.data = ordered_load_rw(lock);
1214 if (word.shared_count != 0) {
1215 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1216 trace_lck, word.shared_count, 0, 0, 0);
1217
1218 word.w_waiting = 1;
1219 ordered_store_rw(lock, word.data);
1220
1221 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1222 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1223 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1224 lck_interlock_unlock(lock, istate);
1225
1226 if (res == THREAD_WAITING) {
1227 res = thread_block(THREAD_CONTINUE_NULL);
1228 slept++;
1229 }
1230 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1231 trace_lck, res, slept, 0, 0);
1232 } else {
1233 lck_interlock_unlock(lock, istate);
1234 break;
1235 }
1236 }
1237 }
1238 #if CONFIG_DTRACE
1239 /*
1240 * We only record an event if we actually took the sleep/spin path above (dtrace_ls_enabled was set).
1241 */
1242 if (dtrace_ls_enabled == TRUE) {
1243 if (slept == 0) {
1244 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1245 } else {
1246 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1247 mach_absolute_time() - wait_interval, 1,
1248 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1249 }
1250 }
1251 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1252 #endif
1253 return TRUE;
1254 }
1255
1256
1257 /*
1258 * Routine: lck_rw_lock_exclusive_to_shared
1259 */
1260
1261 void
1262 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1263 {
1264 uint32_t data, prev;
1265
1266 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1267 ordered_store_rw_owner(lock, THREAD_NULL);
1268 for (;;) {
1269 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1270 if (data & LCK_RW_INTERLOCK) {
1271 #if __SMP__
1272 atomic_exchange_abort();
1273 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1274 continue;
1275 #else
1276 panic("lck_rw_lock_exclusive_to_shared(): Interlock locked (%p): %x", lock, data);
1277 #endif // __SMP__
1278 }
1279 data += LCK_RW_SHARED_READER;
1280 if (data & LCK_RW_WANT_UPGRADE) {
1281 data &= ~(LCK_RW_WANT_UPGRADE);
1282 } else {
1283 data &= ~(LCK_RW_WANT_EXCL);
1284 }
1285 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1286 data &= ~(LCK_RW_W_WAITING);
1287 }
1288 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1289 break;
1290 }
1291 cpu_pause();
1292 }
1293 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1294 }
1295
1296 /*
1297 * Routine: lck_rw_lock_exclusive_to_shared_gen
1298 * Function:
1299 * Fast path has already dropped
1300 * our exclusive state and bumped lck_rw_shared_count
1301 * all we need to do here is determine if anyone
1302 * needs to be awakened.
1303 */
1304 static void
1305 lck_rw_lock_exclusive_to_shared_gen(
1306 lck_rw_t *lck,
1307 uint32_t prior_lock_state)
1308 {
1309 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1310 lck_rw_word_t fake_lck;
1311
1312 /*
1313 * prior_lock state is a snapshot of the 1st word of the
1314 * lock in question... we'll fake up a lck_rw_word_t from it
1315 * and carefully not access anything beyond what's defined
1316 * in the first word of a lck_rw_t
1317 */
1318 fake_lck.data = prior_lock_state;
1319
1320 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1321 trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1322
1323 /*
1324 * don't wake up anyone waiting to take the lock exclusively
1325 * since we hold a read count... when the read count drops to 0,
1326 * the writers will be woken.
1327 *
1328 * wake up any waiting readers if we don't have any writers waiting,
1329 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1330 */
1331 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1332 thread_wakeup(LCK_RW_READER_EVENT(lck));
1333 }
1334
1335 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1336 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1337
1338 #if CONFIG_DTRACE
1339 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1340 #endif
1341 }
1342
1343
1344 /*
1345 * Routine: lck_rw_try_lock
1346 */
1347 boolean_t
1348 lck_rw_try_lock(
1349 lck_rw_t *lck,
1350 lck_rw_type_t lck_rw_type)
1351 {
1352 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1353 return lck_rw_try_lock_shared(lck);
1354 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1355 return lck_rw_try_lock_exclusive(lck);
1356 } else {
1357 panic("lck_rw_try_lock(): Invalid rw lock type: %x", lck_rw_type);
1358 }
1359 return FALSE;
1360 }
1361
1362 /*
1363 * Routine: lck_rw_try_lock_shared
1364 */
1365
1366 boolean_t
1367 lck_rw_try_lock_shared(lck_rw_t *lock)
1368 {
1369 uint32_t data, prev;
1370
1371 for (;;) {
1372 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1373 if (data & LCK_RW_INTERLOCK) {
1374 #if __SMP__
1375 atomic_exchange_abort();
1376 lck_rw_interlock_spin(lock);
1377 continue;
1378 #else
1379 panic("lck_rw_try_lock_shared(): Interlock locked (%p): %x", lock, data);
1380 #endif
1381 }
1382 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1383 atomic_exchange_abort();
1384 return FALSE; /* lock is busy */
1385 }
1386 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1387 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1388 break;
1389 }
1390 cpu_pause();
1391 }
1392 #if MACH_ASSERT
1393 thread_t owner = ordered_load_rw_owner(lock);
1394 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1395 #endif
1396 current_thread()->rwlock_count++;
1397 #if CONFIG_DTRACE
1398 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1399 #endif /* CONFIG_DTRACE */
1400 return TRUE;
1401 }
1402
1403
1404 /*
1405 * Routine: lck_rw_try_lock_exclusive
1406 */
1407
1408 boolean_t
1409 lck_rw_try_lock_exclusive(lck_rw_t *lock)
1410 {
1411 uint32_t data, prev;
1412 thread_t thread;
1413
1414 for (;;) {
1415 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1416 if (data & LCK_RW_INTERLOCK) {
1417 #if __SMP__
1418 atomic_exchange_abort();
1419 lck_rw_interlock_spin(lock);
1420 continue;
1421 #else
1422 panic("lck_rw_try_lock_exclusive(): Interlock locked (%p): %x", lock, data);
1423 #endif
1424 }
1425 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1426 atomic_exchange_abort();
1427 return FALSE;
1428 }
1429 data |= LCK_RW_WANT_EXCL;
1430 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1431 break;
1432 }
1433 cpu_pause();
1434 }
1435 thread = current_thread();
1436 thread->rwlock_count++;
1437 #if MACH_ASSERT
1438 thread_t owner = ordered_load_rw_owner(lock);
1439 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1440 #endif
1441 ordered_store_rw_owner(lock, thread);
1442 #if CONFIG_DTRACE
1443 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1444 #endif /* CONFIG_DTRACE */
1445 return TRUE;
1446 }
1447
1448
1449 /*
1450 * Routine: lck_rw_unlock
1451 */
1452 void
1453 lck_rw_unlock(
1454 lck_rw_t *lck,
1455 lck_rw_type_t lck_rw_type)
1456 {
1457 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1458 lck_rw_unlock_shared(lck);
1459 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1460 lck_rw_unlock_exclusive(lck);
1461 } else {
1462 panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
1463 }
1464 }
1465
1466
1467 /*
1468 * Routine: lck_rw_unlock_shared
1469 */
1470 void
1471 lck_rw_unlock_shared(
1472 lck_rw_t *lck)
1473 {
1474 lck_rw_type_t ret;
1475
1476 assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1477 assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
1478 ret = lck_rw_done(lck);
1479
1480 if (ret != LCK_RW_TYPE_SHARED) {
1481 panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
1482 }
1483 }
1484
1485
1486 /*
1487 * Routine: lck_rw_unlock_exclusive
1488 */
1489 void
1490 lck_rw_unlock_exclusive(
1491 lck_rw_t *lck)
1492 {
1493 lck_rw_type_t ret;
1494
1495 assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1496 ret = lck_rw_done(lck);
1497
1498 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1499 panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
1500 }
1501 }
1502
1503
1504 /*
1505 * Routine: lck_rw_lock_exclusive_gen
1506 */
1507 static void
1508 lck_rw_lock_exclusive_gen(
1509 lck_rw_t *lock)
1510 {
1511 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1512 lck_rw_word_t word;
1513 int slept = 0;
1514 boolean_t gotlock = 0;
1515 boolean_t not_shared_or_upgrade = 0;
1516 wait_result_t res = 0;
1517 boolean_t istate;
1518
1519 #if CONFIG_DTRACE
1520 boolean_t dtrace_ls_initialized = FALSE;
1521 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1522 uint64_t wait_interval = 0;
1523 int readers_at_sleep = 0;
1524 #endif
1525
1526 /*
1527 * Try to acquire the lck_rw_want_excl bit.
1528 */
1529 while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
1530 #if CONFIG_DTRACE
1531 if (dtrace_ls_initialized == FALSE) {
1532 dtrace_ls_initialized = TRUE;
1533 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1534 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1535 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1536 if (dtrace_ls_enabled) {
1537 /*
1538 * Either sleeping or spinning is happening,
1539 * start a timing of our delay interval now.
1540 */
1541 readers_at_sleep = lock->lck_rw_shared_count;
1542 wait_interval = mach_absolute_time();
1543 }
1544 }
1545 #endif
1546
1547 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1548
1549 gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);
1550
1551 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1552
1553 if (gotlock) {
1554 break;
1555 }
1556 /*
1557 * if we get here, the deadline has expired w/o us
1558 * being able to grab the lock exclusively
1559 * check to see if we're allowed to do a thread_block
1560 */
1561 word.data = ordered_load_rw(lock);
1562 if (word.can_sleep) {
1563 istate = lck_interlock_lock(lock);
1564 word.data = ordered_load_rw(lock);
1565
1566 if (word.want_excl) {
1567 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1568
1569 word.w_waiting = 1;
1570 ordered_store_rw(lock, word.data);
1571
1572 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1573 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1574 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1575 lck_interlock_unlock(lock, istate);
1576
1577 if (res == THREAD_WAITING) {
1578 res = thread_block(THREAD_CONTINUE_NULL);
1579 slept++;
1580 }
1581 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1582 } else {
1583 word.want_excl = 1;
1584 ordered_store_rw(lock, word.data);
1585 lck_interlock_unlock(lock, istate);
1586 break;
1587 }
1588 }
1589 }
1590 /*
1591 * Wait for readers (and upgrades) to finish...
1592 */
1593 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
1594 #if CONFIG_DTRACE
1595 /*
1596 * Either sleeping or spinning is happening, start
1597 * a timing of our delay interval now. If we set it
1598 * to -1 we don't have accurate data so we cannot later
1599 * decide to record a dtrace spin or sleep event.
1600 */
1601 if (dtrace_ls_initialized == FALSE) {
1602 dtrace_ls_initialized = TRUE;
1603 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1604 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1605 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1606 if (dtrace_ls_enabled) {
1607 /*
1608 * Either sleeping or spinning is happening,
1609 * start a timing of our delay interval now.
1610 */
1611 readers_at_sleep = lock->lck_rw_shared_count;
1612 wait_interval = mach_absolute_time();
1613 }
1614 }
1615 #endif
1616
1617 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1618
1619 not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);
1620
1621 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);
1622
1623 if (not_shared_or_upgrade) {
1624 break;
1625 }
1626 /*
1627 * if we get here, the deadline has expired w/o us
1628 * being able to grab the lock exclusively
1629 * check to see if we're allowed to do a thread_block
1630 */
1631 word.data = ordered_load_rw(lock);
1632 if (word.can_sleep) {
1633 istate = lck_interlock_lock(lock);
1634 word.data = ordered_load_rw(lock);
1635
1636 if (word.shared_count != 0 || word.want_upgrade) {
1637 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1638
1639 word.w_waiting = 1;
1640 ordered_store_rw(lock, word.data);
1641
1642 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1643 res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1644 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1645 lck_interlock_unlock(lock, istate);
1646
1647 if (res == THREAD_WAITING) {
1648 res = thread_block(THREAD_CONTINUE_NULL);
1649 slept++;
1650 }
1651 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1652 } else {
1653 lck_interlock_unlock(lock, istate);
1654 /*
1655 * must own the lock now, since we checked for
1656 * readers or upgrade owner behind the interlock
1657 * no need for a call to 'lck_rw_drain_status'
1658 */
1659 break;
1660 }
1661 }
1662 }
1663
1664 #if CONFIG_DTRACE
1665 /*
1666 * Decide what latencies we suffered that are Dtrace events.
1667 * If we have set wait_interval, then we either spun or slept.
1668 * At least we get out from under the interlock before we record
1669 * which is the best we can do here to minimize the impact
1670 * of the tracing.
1671 * If dtrace_ls_enabled was never set, then dtrace was not enabled when we
1672 * started sleeping/spinning so we don't record this event.
1673 */
1674 if (dtrace_ls_enabled == TRUE) {
1675 if (slept == 0) {
1676 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
1677 mach_absolute_time() - wait_interval, 1);
1678 } else {
1679 /*
1680 * For the blocking case, we also record if when we blocked
1681 * it was held for read or write, and how many readers.
1682 * Notice that above we recorded this before we dropped
1683 * the interlock so the count is accurate.
1684 */
1685 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
1686 mach_absolute_time() - wait_interval, 1,
1687 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1688 }
1689 }
1690 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
1691 #endif /* CONFIG_DTRACE */
1692 }
1693
1694 /*
1695 * Routine: lck_rw_done
1696 */
1697
1698 lck_rw_type_t
1699 lck_rw_done(lck_rw_t *lock)
1700 {
1701 uint32_t data, prev;
1702 boolean_t once = FALSE;
1703
1704 for (;;) {
1705 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1706 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1707 #if __SMP__
1708 atomic_exchange_abort();
1709 lck_rw_interlock_spin(lock);
1710 continue;
1711 #else
1712 panic("lck_rw_done(): Interlock locked (%p): %x", lock, data);
1713 #endif // __SMP__
1714 }
1715 if (data & LCK_RW_SHARED_MASK) { /* lock is held shared */
1716 assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1717 data -= LCK_RW_SHARED_READER;
1718 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1719 goto check_waiters;
1720 }
1721 } else { /* if reader count == 0, must be exclusive lock */
1722 if (data & LCK_RW_WANT_UPGRADE) {
1723 data &= ~(LCK_RW_WANT_UPGRADE);
1724 } else {
1725 if (data & LCK_RW_WANT_EXCL) {
1726 data &= ~(LCK_RW_WANT_EXCL);
1727 } else { /* lock is not 'owned', panic */
1728 panic("Releasing non-exclusive RW lock without a reader refcount!");
1729 }
1730 }
1731 if (!once) {
1732 // Only check for holder and clear it once
1733 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1734 ordered_store_rw_owner(lock, THREAD_NULL);
1735 once = TRUE;
1736 }
1737 check_waiters:
1738 /*
1739 * test the original values to match what
1740 * lck_rw_done_gen is going to do to determine
1741 * which wakeups need to happen...
1742 *
1743 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
1744 */
1745 if (prev & LCK_RW_W_WAITING) {
1746 data &= ~(LCK_RW_W_WAITING);
1747 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1748 data &= ~(LCK_RW_R_WAITING);
1749 }
1750 } else {
1751 data &= ~(LCK_RW_R_WAITING);
1752 }
1753 }
1754 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1755 break;
1756 }
1757 cpu_pause();
1758 }
1759 return lck_rw_done_gen(lock, prev);
1760 }
1761
1762 /*
1763 * Routine: lck_rw_done_gen
1764 *
1765 * called from lck_rw_done()...
1766 * prior_lock_state is the value in the 1st
1767 * word of the lock at the time of a successful
1768 * atomic compare and exchange with the new value...
1769 * it represents the state of the lock before we
1770 * decremented the rw_shared_count or cleared either
1771 * rw_want_upgrade or rw_want_excl and
1772 * the lck_x_waiting bits... since lck_rw_done()
1773 * has already changed the state atomically,
1774 * we just need to decide if we should
1775 * wake up anyone and what value to return... we do
1776 * this by examining the state of the lock before
1777 * we changed it
1778 */
1779 static lck_rw_type_t
1780 lck_rw_done_gen(
1781 lck_rw_t *lck,
1782 uint32_t prior_lock_state)
1783 {
1784 lck_rw_word_t fake_lck;
1785 lck_rw_type_t lock_type;
1786 thread_t thread;
1787 uint32_t rwlock_count;
1788
1789 /*
1790 * prior_lock state is a snapshot of the 1st word of the
1791 * lock in question... we'll fake up a lck_rw_word_t from it
1792 * and carefully not access anything beyond what's defined
1793 * in the first word of a lck_rw_t
1794 */
1795 fake_lck.data = prior_lock_state;
1796
1797 if (fake_lck.shared_count <= 1) {
1798 if (fake_lck.w_waiting) {
1799 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1800 }
1801
1802 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1803 thread_wakeup(LCK_RW_READER_EVENT(lck));
1804 }
1805 }
1806 if (fake_lck.shared_count) {
1807 lock_type = LCK_RW_TYPE_SHARED;
1808 } else {
1809 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1810 }
1811
1812 /* Check if dropping the lock means that we need to unpromote */
1813 thread = current_thread();
1814 rwlock_count = thread->rwlock_count--;
1815 #if MACH_LDEBUG
1816 if (rwlock_count == 0) {
1817 panic("rw lock count underflow for thread %p", thread);
1818 }
1819 #endif
1820 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1821 /* sched_flags checked without lock, but will be rechecked while clearing */
1822 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1823 }
1824 #if CONFIG_DTRACE
1825 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1826 #endif
1827 return lock_type;
1828 }
1829
1830 /*
1831 * Routine: lck_rw_lock_shared_gen
1832 * Function:
1833 * Fast path code has determined that this lock
1834 * is held exclusively... this is where we spin/block
1835 * until we can acquire the lock in the shared mode
1836 */
1837 static void
1838 lck_rw_lock_shared_gen(
1839 lck_rw_t *lck)
1840 {
1841 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1842 lck_rw_word_t word;
1843 boolean_t gotlock = 0;
1844 int slept = 0;
1845 wait_result_t res = 0;
1846 boolean_t istate;
1847
1848 #if CONFIG_DTRACE
1849 uint64_t wait_interval = 0;
1850 int readers_at_sleep = 0;
1851 boolean_t dtrace_ls_initialized = FALSE;
1852 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1853 #endif /* CONFIG_DTRACE */
1854
1855 while (!lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
1856 #if CONFIG_DTRACE
1857 if (dtrace_ls_initialized == FALSE) {
1858 dtrace_ls_initialized = TRUE;
1859 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1860 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1861 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1862 if (dtrace_ls_enabled) {
1863 /*
1864 * Either sleeping or spinning is happening,
1865 * start a timing of our delay interval now.
1866 */
1867 readers_at_sleep = lck->lck_rw_shared_count;
1868 wait_interval = mach_absolute_time();
1869 }
1870 }
1871 #endif
1872
1873 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1874 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1875
1876 gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);
1877
1878 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1879 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);
1880
1881 if (gotlock) {
1882 break;
1883 }
1884 /*
1885 * if we get here, the deadline has expired without us
1886 * being able to grab the lock for read;
1887 * check whether we're allowed to do a thread_block
1888 */
1889 if (lck->lck_rw_can_sleep) {
1890 istate = lck_interlock_lock(lck);
1891
1892 word.data = ordered_load_rw(lck);
1893 if ((word.want_excl || word.want_upgrade) &&
1894 ((word.shared_count == 0) || word.priv_excl)) {
1895 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1896 trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1897
1898 word.r_waiting = 1;
1899 ordered_store_rw(lck, word.data);
1900
1901 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1902 res = assert_wait(LCK_RW_READER_EVENT(lck),
1903 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1904 lck_interlock_unlock(lck, istate);
1905
1906 if (res == THREAD_WAITING) {
1907 res = thread_block(THREAD_CONTINUE_NULL);
1908 slept++;
1909 }
1910 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1911 trace_lck, res, slept, 0, 0);
1912 } else {
1913 word.shared_count++;
1914 ordered_store_rw(lck, word.data);
1915 lck_interlock_unlock(lck, istate);
1916 break;
1917 }
1918 }
1919 }
1920
1921 #if CONFIG_DTRACE
1922 if (dtrace_ls_enabled == TRUE) {
1923 if (slept == 0) {
1924 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1925 } else {
1926 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1927 mach_absolute_time() - wait_interval, 0,
1928 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1929 }
1930 }
1931 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1932 #endif /* CONFIG_DTRACE */
1933 }
1934
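/*
 * Illustrative sketch (editorial addition): lck_rw_lock_shared() only falls
 * into lck_rw_lock_shared_gen() when the fast path observes want_excl or
 * want_upgrade; a typical reader does not care which path was taken. The
 * function below is hypothetical.
 */
#if 0   /* example only; not compiled */
static int
example_read_field(lck_rw_t *rwl, const int *shared_field)
{
	int value;

	lck_rw_lock_shared(rwl);        /* may spin and/or block in lck_rw_lock_shared_gen() */
	value = *shared_field;          /* read-only critical section */
	(void) lck_rw_done(rwl);        /* wakes a waiting writer if we were the last reader */
	return value;
}
#endif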
1935
1936 void
1937 lck_rw_assert(
1938 lck_rw_t *lck,
1939 unsigned int type)
1940 {
1941 switch (type) {
1942 case LCK_RW_ASSERT_SHARED:
1943 if ((lck->lck_rw_shared_count != 0) &&
1944 (lck->lck_rw_owner == THREAD_NULL)) {
1945 return;
1946 }
1947 break;
1948 case LCK_RW_ASSERT_EXCLUSIVE:
1949 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1950 (lck->lck_rw_shared_count == 0) &&
1951 (lck->lck_rw_owner == current_thread())) {
1952 return;
1953 }
1954 break;
1955 case LCK_RW_ASSERT_HELD:
1956 if (lck->lck_rw_shared_count != 0) {
1957 return; // Held shared
1958 }
1959 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1960 (lck->lck_rw_owner == current_thread())) {
1961 return; // Held exclusive
1962 }
1963 break;
1964 case LCK_RW_ASSERT_NOTHELD:
1965 if ((lck->lck_rw_shared_count == 0) &&
1966 !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
1967 (lck->lck_rw_owner == THREAD_NULL)) {
1968 return;
1969 }
1970 break;
1971 default:
1972 break;
1973 }
1974 panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
1975 }
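/*
 * Illustrative sketch (editorial addition): lck_rw_assert() is typically used
 * to document and enforce a locking precondition at function entry. The
 * function below is hypothetical.
 */
#if 0   /* example only; not compiled */
static void
example_requires_writer(lck_rw_t *rwl)
{
	lck_rw_assert(rwl, LCK_RW_ASSERT_EXCLUSIVE);    /* panics unless held exclusively by us */
	/* ... mutate state protected by rwl ... */
}
#endif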
1976
1977
1978 /*
1979 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1980 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1981 */
1982 boolean_t
1983 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
1984 {
1985 if (not_in_kdp) {
1986 panic("panic: rw lock exclusive check done outside of kernel debugger");
1987 }
1988 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1989 }
1990
1991 /*
1992 * The C portion of the mutex package. These routines are only invoked
1993 * if the optimized assembler routines can't do the work.
1994 */
1995
1996 /*
1997 * Forward declaration
1998 */
1999
2000 void
2001 lck_mtx_ext_init(
2002 lck_mtx_ext_t * lck,
2003 lck_grp_t * grp,
2004 lck_attr_t * attr);
2005
2006 /*
2007 * Routine: lck_mtx_alloc_init
2008 */
2009 lck_mtx_t *
2010 lck_mtx_alloc_init(
2011 lck_grp_t * grp,
2012 lck_attr_t * attr)
2013 {
2014 lck_mtx_t *lck;
2015
2016 if ((lck = (lck_mtx_t *) kalloc(sizeof(lck_mtx_t))) != 0) {
2017 lck_mtx_init(lck, grp, attr);
2018 }
2019
2020 return lck;
2021 }
2022
2023 /*
2024 * Routine: lck_mtx_free
2025 */
2026 void
2027 lck_mtx_free(
2028 lck_mtx_t * lck,
2029 lck_grp_t * grp)
2030 {
2031 lck_mtx_destroy(lck, grp);
2032 kfree(lck, sizeof(lck_mtx_t));
2033 }
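/*
 * Illustrative sketch (editorial addition): a typical dynamically allocated
 * mutex lifecycle built from the helpers above. The group/attribute setup is
 * an assumption about the caller, not something defined in this file.
 */
#if 0   /* example only; not compiled */
static void
example_mutex_lifecycle(void)
{
	lck_grp_t *grp = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	lck_mtx_t *m = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

	lck_mtx_lock(m);
	/* ... critical section ... */
	lck_mtx_unlock(m);

	lck_mtx_free(m, grp);   /* lck_mtx_destroy() followed by kfree() */
	lck_grp_free(grp);
}
#endif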
2034
2035 /*
2036 * Routine: lck_mtx_init
2037 */
2038 void
2039 lck_mtx_init(
2040 lck_mtx_t * lck,
2041 lck_grp_t * grp,
2042 lck_attr_t * attr)
2043 {
2044 #ifdef BER_XXX
2045 lck_mtx_ext_t *lck_ext;
2046 #endif
2047 lck_attr_t *lck_attr;
2048
2049 if (attr != LCK_ATTR_NULL) {
2050 lck_attr = attr;
2051 } else {
2052 lck_attr = &LockDefaultLckAttr;
2053 }
2054
2055 #ifdef BER_XXX
2056 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2057 if ((lck_ext = (lck_mtx_ext_t *) kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2058 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2059 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2060 lck->lck_mtx_ptr = lck_ext;
2061 lck->lck_mtx_type = LCK_MTX_TYPE;
2062 }
2063 } else
2064 #endif
2065 {
2066 lck->lck_mtx_ptr = NULL; // Clear any padding in the union fields below
2067 lck->lck_mtx_waiters = 0;
2068 lck->lck_mtx_type = LCK_MTX_TYPE;
2069 ordered_store_mtx(lck, 0);
2070 }
2071 lck_grp_reference(grp);
2072 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2073 }
2074
2075 /*
2076 * Routine: lck_mtx_init_ext
2077 */
2078 void
2079 lck_mtx_init_ext(
2080 lck_mtx_t * lck,
2081 lck_mtx_ext_t * lck_ext,
2082 lck_grp_t * grp,
2083 lck_attr_t * attr)
2084 {
2085 lck_attr_t *lck_attr;
2086
2087 if (attr != LCK_ATTR_NULL) {
2088 lck_attr = attr;
2089 } else {
2090 lck_attr = &LockDefaultLckAttr;
2091 }
2092
2093 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2094 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2095 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2096 lck->lck_mtx_ptr = lck_ext;
2097 lck->lck_mtx_type = LCK_MTX_TYPE;
2098 } else {
2099 lck->lck_mtx_waiters = 0;
2100 lck->lck_mtx_type = LCK_MTX_TYPE;
2101 ordered_store_mtx(lck, 0);
2102 }
2103 lck_grp_reference(grp);
2104 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2105 }
2106
2107 /*
2108 * Routine: lck_mtx_ext_init
2109 */
2110 void
2111 lck_mtx_ext_init(
2112 lck_mtx_ext_t * lck,
2113 lck_grp_t * grp,
2114 lck_attr_t * attr)
2115 {
2116 bzero((void *) lck, sizeof(lck_mtx_ext_t));
2117
2118 lck->lck_mtx.lck_mtx_type = LCK_MTX_TYPE;
2119
2120 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2121 lck->lck_mtx_deb.type = MUTEX_TAG;
2122 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2123 }
2124 lck->lck_mtx_grp = grp;
2125
2126 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2127 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2128 }
2129 }
2130
2131 /* The slow versions */
2132 static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2133 static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
2134 static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2135
2136 /* The adaptive spin function */
2137 static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2138
2139 /*
2140 * Routine: lck_mtx_verify
2141 *
2142 * Verify that a mutex is valid
2143 */
2144 static inline void
2145 lck_mtx_verify(lck_mtx_t *lock)
2146 {
2147 if (lock->lck_mtx_type != LCK_MTX_TYPE) {
2148 panic("Invalid mutex %p", lock);
2149 }
2150 #if DEVELOPMENT || DEBUG
2151 if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2152 panic("Mutex destroyed %p", lock);
2153 }
2154 #endif /* DEVELOPMENT || DEBUG */
2155 }
2156
2157 /*
2158 * Routine: lck_mtx_check_preemption
2159 *
2160 * Verify preemption is enabled when attempting to acquire a mutex.
2161 */
2162
2163 static inline void
2164 lck_mtx_check_preemption(lck_mtx_t *lock)
2165 {
2166 #if DEVELOPMENT || DEBUG
2167 int pl = get_preemption_level();
2168
2169 if (pl != 0) {
2170 panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
2171 }
2172 #else
2173 (void)lock;
2174 #endif
2175 }
2176
2177 /*
2178 * Routine: lck_mtx_lock
2179 */
2180 void
2181 lck_mtx_lock(lck_mtx_t *lock)
2182 {
2183 thread_t thread;
2184
2185 lck_mtx_verify(lock);
2186 lck_mtx_check_preemption(lock);
2187 thread = current_thread();
2188 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2189 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2190 #if CONFIG_DTRACE
2191 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2192 #endif /* CONFIG_DTRACE */
2193 return;
2194 }
2195 lck_mtx_lock_contended(lock, thread, FALSE);
2196 }
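/*
 * Illustrative sketch (editorial addition): the uncontended fast path above is
 * a single compare-and-swap of the lock word from 0 (free) to the owner
 * encoding of the current thread; unlock reverses it with release ordering.
 * toy_word, toy_lock and toy_unlock are hypothetical names.
 */
#if 0   /* example only; not compiled */
static uintptr_t toy_word;

static boolean_t
toy_lock(thread_t self)
{
	/* acquire ordering: critical-section accesses cannot move before this */
	return os_atomic_cmpxchg(&toy_word, 0, (uintptr_t)self, acquire);
}

static boolean_t
toy_unlock(thread_t self)
{
	/* release ordering: critical-section stores are visible before the word clears */
	return os_atomic_cmpxchg(&toy_word, (uintptr_t)self, 0, release);
}
#endif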
2197
2198 /*
2199 * This is the slow version of mutex locking.
2200 */
2201 static void NOINLINE
2202 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2203 {
2204 thread_t holding_thread;
2205 uintptr_t state;
2206 int waiters = 0;
2207 spinwait_result_t sw_res;
2208 struct turnstile *ts = NULL;
2209
2210 /* Loop waiting until I see that the mutex is unowned */
2211 for (;;) {
2212 sw_res = lck_mtx_lock_contended_spinwait_arm(lock, thread, interlocked);
2213 interlocked = FALSE;
2214
2215 switch (sw_res) {
2216 case SPINWAIT_ACQUIRED:
2217 if (ts != NULL) {
2218 interlock_lock(lock);
2219 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2220 interlock_unlock(lock);
2221 }
2222 goto done;
2223 case SPINWAIT_INTERLOCK:
2224 goto set_owner;
2225 default:
2226 break;
2227 }
2228
2229 state = ordered_load_mtx(lock);
2230 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2231 if (holding_thread == NULL) {
2232 break;
2233 }
2234 ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
2235 lck_mtx_lock_wait(lock, holding_thread, &ts);
2236 /* returns interlock unlocked */
2237 }
2238
2239 set_owner:
2240 /* Hooray, I'm the new owner! */
2241 state = ordered_load_mtx(lock);
2242
2243 if (state & ARM_LCK_WAITERS) {
2244 /* Skip lck_mtx_lock_acquire if there are no waiters. */
2245 waiters = lck_mtx_lock_acquire(lock, ts);
2246 /*
2247 * lck_mtx_lock_acquire will call
2248 * turnstile_complete
2249 */
2250 } else {
2251 if (ts != NULL) {
2252 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2253 }
2254 }
2255
2256 state = LCK_MTX_THREAD_TO_STATE(thread);
2257 if (waiters != 0) {
2258 state |= ARM_LCK_WAITERS;
2259 }
2260 #if __SMP__
2261 state |= LCK_ILOCK; // Preserve interlock
2262 ordered_store_mtx(lock, state); // Set ownership
2263 interlock_unlock(lock); // Release interlock, enable preemption
2264 #else
2265 ordered_store_mtx(lock, state); // Set ownership
2266 enable_preemption();
2267 #endif
2268
2269 done:
2270 load_memory_barrier();
2271
2272 assert(thread->turnstile != NULL);
2273
2274 if (ts != NULL) {
2275 turnstile_cleanup();
2276 }
2277
2278 #if CONFIG_DTRACE
2279 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2280 #endif /* CONFIG_DTRACE */
2281 }
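/*
 * Illustrative sketch (editorial addition): how the contended path above
 * rebuilds the mutex word before releasing the interlock, assuming (as the
 * code above relies on) that LCK_MTX_STATE_TO_THREAD() masks off the low
 * flag bits. The function name is hypothetical.
 */
#if 0   /* example only; not compiled */
static void
example_compose_state(void)
{
	uintptr_t new_state = LCK_MTX_THREAD_TO_STATE(current_thread());

	new_state |= ARM_LCK_WAITERS;   /* other threads remain blocked on the turnstile */
	new_state |= LCK_ILOCK;         /* interlock stays held until interlock_unlock() */
	assert(LCK_MTX_STATE_TO_THREAD(new_state) == current_thread());
}
#endif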
2282
2283 /*
2284 * Routine: lck_mtx_lock_contended_spinwait_arm
2285 *
2286 * Invoked when trying to acquire a mutex and there is contention but
2287 * the holder is running on another processor. We spin for up to a maximum
2288 * time waiting for the lock to be released.
2289 */
2290 static spinwait_result_t
2291 lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2292 {
2293 int has_interlock = (int)interlocked;
2294 #if __SMP__
2295 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
2296 thread_t holder;
2297 uint64_t overall_deadline;
2298 uint64_t check_owner_deadline;
2299 uint64_t cur_time;
2300 spinwait_result_t retval = SPINWAIT_DID_SPIN;
2301 int loopcount = 0;
2302 uintptr_t state;
2303 boolean_t istate;
2304
2305 if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) {
2306 if (!has_interlock) {
2307 interlock_lock(lock);
2308 }
2309
2310 return SPINWAIT_DID_NOT_SPIN;
2311 }
2312
2313 state = ordered_load_mtx(lock);
2314
2315 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2316 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0);
2317
2318 cur_time = mach_absolute_time();
2319 overall_deadline = cur_time + MutexSpin;
2320 check_owner_deadline = cur_time;
2321
2322 if (has_interlock) {
2323 istate = ml_get_interrupts_enabled();
2324 }
2325
2326 /* Snoop the lock state */
2327 state = ordered_load_mtx(lock);
2328
2329 /*
2330 * Spin while:
2331 * - mutex is locked, and
2332 * - it's locked as a spin lock, and
2333 * - owner is running on another processor, and
2334 * - owner (processor) is not idling, and
2335 * - we haven't spun for long enough.
2336 */
2337 do {
2338 if (!(state & LCK_ILOCK) || has_interlock) {
2339 if (!has_interlock) {
2340 has_interlock = interlock_try_disable_interrupts(lock, &istate);
2341 }
2342
2343 if (has_interlock) {
2344 state = ordered_load_mtx(lock);
2345 holder = LCK_MTX_STATE_TO_THREAD(state);
2346
2347 if (holder == NULL) {
2348 retval = SPINWAIT_INTERLOCK;
2349
2350 if (istate) {
2351 ml_set_interrupts_enabled(istate);
2352 }
2353
2354 break;
2355 }
2356
2357 if (!(holder->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU) ||
2358 (holder->state & TH_IDLE)) {
2359 if (loopcount == 0) {
2360 retval = SPINWAIT_DID_NOT_SPIN;
2361 }
2362
2363 if (istate) {
2364 ml_set_interrupts_enabled(istate);
2365 }
2366
2367 break;
2368 }
2369
2370 interlock_unlock_enable_interrupts(lock, istate);
2371 has_interlock = 0;
2372 }
2373 }
2374
2375 cur_time = mach_absolute_time();
2376
2377 if (cur_time >= overall_deadline) {
2378 break;
2379 }
2380
2381 check_owner_deadline = cur_time + (MutexSpin / SPINWAIT_OWNER_CHECK_COUNT);
2382
2383 if (cur_time < check_owner_deadline) {
2384 machine_delay_until(check_owner_deadline - cur_time, check_owner_deadline);
2385 }
2386
2387 /* Snoop the lock state */
2388 state = ordered_load_mtx(lock);
2389
2390 if (state == 0) {
2391 /* Try to grab the lock. */
2392 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2393 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2394 retval = SPINWAIT_ACQUIRED;
2395 break;
2396 }
2397 }
2398
2399 loopcount++;
2400 } while (TRUE);
2401
2402 #if CONFIG_DTRACE
2403 /*
2404 * We did not record a separate spin-start timestamp; overall_deadline
2405 * was computed as start + MutexSpin, so if dtrace is active we work
2406 * backwards from it to determine how long we spun.
2407 *
2408 * Note that we record a different probe id depending on whether
2409 * this is a direct or indirect mutex. This allows us to
2410 * penalize only lock groups that have debug/stats enabled
2411 * with dtrace processing if desired.
2412 */
2413 if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) {
2414 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock,
2415 mach_absolute_time() - (overall_deadline - MutexSpin));
2416 } else {
2417 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock,
2418 mach_absolute_time() - (overall_deadline - MutexSpin));
2419 }
2420 /* The lockstat acquire event is recorded by the caller. */
2421 #endif
2422
2423 state = ordered_load_mtx(lock);
2424
2425 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2426 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0);
2427 #else /* __SMP__ */
2428 /* Spinwaiting is not useful on UP systems. */
2429 #pragma unused(lock, thread)
2430 int retval = SPINWAIT_DID_NOT_SPIN;
2431 #endif /* __SMP__ */
2432 if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) {
2433 /* We must own either the lock or the interlock on return. */
2434 interlock_lock(lock);
2435 }
2436
2437 return retval;
2438 }
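/*
 * Illustrative sketch (editorial addition): the adaptive spin above is bounded
 * to MutexSpin absolute-time units overall, with owner re-checks spaced at
 * most MutexSpin / SPINWAIT_OWNER_CHECK_COUNT apart. The helper below only
 * restates that arithmetic; its name is hypothetical.
 */
#if 0   /* example only; not compiled */
static void
example_spin_budget(void)
{
	uint64_t start = mach_absolute_time();
	uint64_t overall_deadline = start + MutexSpin;
	uint64_t owner_check_deadline = start + (MutexSpin / SPINWAIT_OWNER_CHECK_COUNT);

	/* The spin loop delays until owner_check_deadline between owner checks
	 * and gives up entirely once overall_deadline has passed. */
	assert(owner_check_deadline <= overall_deadline);
}
#endif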
2439
2440 /*
2441 * Common code for mutex locking as spinlock
2442 */
2443 static inline void
2444 lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2445 {
2446 uintptr_t state;
2447
2448 interlock_lock(lock);
2449 state = ordered_load_mtx(lock);
2450 if (LCK_MTX_STATE_TO_THREAD(state)) {
2451 if (allow_held_as_mutex) {
2452 lck_mtx_lock_contended(lock, current_thread(), TRUE);
2453 } else {
2454 // "Always" variants can never block. If the lock is held and blocking is not allowed
2455 // then someone is mixing always and non-always calls on the same lock, which is
2456 // forbidden.
2457 panic("Attempting to block on a lock taken as spin-always %p", lock);
2458 }
2459 return;
2460 }
2461 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2462 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2463 ordered_store_mtx(lock, state);
2464 load_memory_barrier();
2465
2466 #if CONFIG_DTRACE
2467 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
2468 #endif /* CONFIG_DTRACE */
2469 }
2470
2471 /*
2472 * Routine: lck_mtx_lock_spin
2473 */
2474 void
2475 lck_mtx_lock_spin(lck_mtx_t *lock)
2476 {
2477 lck_mtx_check_preemption(lock);
2478 lck_mtx_lock_spin_internal(lock, TRUE);
2479 }
2480
2481 /*
2482 * Routine: lck_mtx_lock_spin_always
2483 */
2484 void
2485 lck_mtx_lock_spin_always(lck_mtx_t *lock)
2486 {
2487 lck_mtx_lock_spin_internal(lock, FALSE);
2488 }
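/*
 * Illustrative sketch (editorial addition): a short critical section taken in
 * spin mode. lck_mtx_unlock() recognizes LCK_MTX_SPIN_TAG and releases the
 * interlock through its slow path. Names below are hypothetical.
 */
#if 0   /* example only; not compiled */
static void
example_spin_section(lck_mtx_t *m, int *counter)
{
	lck_mtx_lock_spin(m);   /* interlock held, preemption disabled, must not block */
	(*counter)++;           /* keep this section short */
	lck_mtx_unlock(m);      /* slow path clears the spin tag and drops the interlock */
}
#endif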
2489
2490 /*
2491 * Routine: lck_mtx_try_lock
2492 */
2493 boolean_t
2494 lck_mtx_try_lock(lck_mtx_t *lock)
2495 {
2496 thread_t thread = current_thread();
2497
2498 lck_mtx_verify(lock);
2499 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2500 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
2501 #if CONFIG_DTRACE
2502 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
2503 #endif /* CONFIG_DTRACE */
2504 return TRUE;
2505 }
2506 return lck_mtx_try_lock_contended(lock, thread);
2507 }
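/*
 * Illustrative sketch (editorial addition): callers of lck_mtx_try_lock() must
 * tolerate a FALSE return and must not touch the protected state in that
 * case. The function below is hypothetical.
 */
#if 0   /* example only; not compiled */
static boolean_t
example_try_update(lck_mtx_t *m, int *counter)
{
	if (!lck_mtx_try_lock(m)) {
		return FALSE;   /* lock owned elsewhere; caller backs off */
	}
	(*counter)++;           /* critical section */
	lck_mtx_unlock(m);
	return TRUE;
}
#endif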
2508
2509 static boolean_t NOINLINE
2510 lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
2511 {
2512 thread_t holding_thread;
2513 uintptr_t state;
2514 int waiters;
2515
2516 #if __SMP__
2517 interlock_lock(lock);
2518 state = ordered_load_mtx(lock);
2519 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2520 if (holding_thread) {
2521 interlock_unlock(lock);
2522 return FALSE;
2523 }
2524 #else
2525 disable_preemption_for_thread(thread);
2526 state = ordered_load_mtx(lock);
2527 if (state & LCK_ILOCK) {
2528 panic("Unexpected interlock set (%p)", lock);
2529 }
2530 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2531 if (holding_thread) {
2532 enable_preemption();
2533 return FALSE;
2534 }
2535 state |= LCK_ILOCK;
2536 ordered_store_mtx(lock, state);
2537 #endif // __SMP__
2538 waiters = lck_mtx_lock_acquire(lock, NULL);
2539 state = LCK_MTX_THREAD_TO_STATE(thread);
2540 if (waiters != 0) {
2541 state |= ARM_LCK_WAITERS;
2542 }
2543 #if __SMP__
2544 state |= LCK_ILOCK; // Preserve interlock
2545 ordered_store_mtx(lock, state); // Set ownership
2546 interlock_unlock(lock); // Release interlock, enable preemption
2547 #else
2548 ordered_store_mtx(lock, state); // Set ownership
2549 enable_preemption();
2550 #endif
2551 load_memory_barrier();
2552
2553 turnstile_cleanup();
2554
2555 return TRUE;
2556 }
2557
2558 static inline boolean_t
2559 lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2560 {
2561 uintptr_t state;
2562
2563 if (!interlock_try(lock)) {
2564 return FALSE;
2565 }
2566 state = ordered_load_mtx(lock);
2567 if (LCK_MTX_STATE_TO_THREAD(state)) {
2568 // Lock is held as mutex
2569 if (allow_held_as_mutex) {
2570 interlock_unlock(lock);
2571 } else {
2572 // "Always" variants can never block. If the lock is held as a normal mutex
2573 // then someone is mixing always and non-always calls on the same lock, which is
2574 // forbidden.
2575 panic("Spin-mutex held as full mutex %p", lock);
2576 }
2577 return FALSE;
2578 }
2579 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2580 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2581 ordered_store_mtx(lock, state);
2582 load_memory_barrier();
2583
2584 #if CONFIG_DTRACE
2585 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
2586 #endif /* CONFIG_DTRACE */
2587 return TRUE;
2588 }
2589
2590 /*
2591 * Routine: lck_mtx_try_lock_spin
2592 */
2593 boolean_t
2594 lck_mtx_try_lock_spin(lck_mtx_t *lock)
2595 {
2596 return lck_mtx_try_lock_spin_internal(lock, TRUE);
2597 }
2598
2599 /*
2600 * Routine: lck_mtx_try_lock_spin_always
2601 */
2602 boolean_t
2603 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
2604 {
2605 return lck_mtx_try_lock_spin_internal(lock, FALSE);
2606 }
2607
2608
2609
2610 /*
2611 * Routine: lck_mtx_unlock
2612 */
2613 void
2614 lck_mtx_unlock(lck_mtx_t *lock)
2615 {
2616 thread_t thread = current_thread();
2617 uintptr_t state;
2618 boolean_t ilk_held = FALSE;
2619
2620 lck_mtx_verify(lock);
2621
2622 state = ordered_load_mtx(lock);
2623 if (state & LCK_ILOCK) {
2624 if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG) {
2625 ilk_held = TRUE; // Interlock is held by (presumably) this thread
2626 }
2627 goto slow_case;
2628 }
2629 // Locked as a mutex
2630 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
2631 LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
2632 #if CONFIG_DTRACE
2633 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2634 #endif /* CONFIG_DTRACE */
2635 return;
2636 }
2637 slow_case:
2638 lck_mtx_unlock_contended(lock, thread, ilk_held);
2639 }
2640
2641 static void NOINLINE
2642 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
2643 {
2644 uintptr_t state;
2645 boolean_t cleanup = FALSE;
2646
2647 if (ilk_held) {
2648 state = ordered_load_mtx(lock);
2649 } else {
2650 #if __SMP__
2651 interlock_lock(lock);
2652 state = ordered_load_mtx(lock);
2653 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
2654 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2655 }
2656 #else
2657 disable_preemption_for_thread(thread);
2658 state = ordered_load_mtx(lock);
2659 if (state & LCK_ILOCK) {
2660 panic("lck_mtx_unlock(): Unexpected interlock set (%p)", lock);
2661 }
2662 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
2663 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2664 }
2665 state |= LCK_ILOCK;
2666 ordered_store_mtx(lock, state);
2667 #endif
2668 if (state & ARM_LCK_WAITERS) {
2669 if (lck_mtx_unlock_wakeup(lock, thread)) {
2670 state = ARM_LCK_WAITERS;
2671 } else {
2672 state = 0;
2673 }
2674 cleanup = TRUE;
2675 goto unlock;
2676 }
2677 }
2678 state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */
2679 unlock:
2680 #if __SMP__
2681 state |= LCK_ILOCK;
2682 ordered_store_mtx(lock, state);
2683 interlock_unlock(lock);
2684 #else
2685 ordered_store_mtx(lock, state);
2686 enable_preemption();
2687 #endif
2688 if (cleanup) {
2689 /*
2690 * Do not do any turnstile operations outside of this block.
2691 * lock/unlock is called at an early stage of boot with a single thread,
2692 * when turnstiles are not yet initialized.
2693 * Even without contention we can come through the slow path
2694 * if the mutex is acquired as a spin lock.
2695 */
2696 turnstile_cleanup();
2697 }
2698
2699 #if CONFIG_DTRACE
2700 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2701 #endif /* CONFIG_DTRACE */
2702 }
2703
2704 /*
2705 * Routine: lck_mtx_assert
2706 */
2707 void
2708 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
2709 {
2710 thread_t thread, holder;
2711 uintptr_t state;
2712
2713 state = ordered_load_mtx(lock);
2714 holder = LCK_MTX_STATE_TO_THREAD(state);
2715 if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
2716 // Lock is held in spin mode, owner is unknown.
2717 return; // Punt
2718 }
2719 thread = current_thread();
2720 if (type == LCK_MTX_ASSERT_OWNED) {
2721 if (thread != holder) {
2722 panic("lck_mtx_assert(): mutex (%p) owned", lock);
2723 }
2724 } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
2725 if (thread == holder) {
2726 panic("lck_mtx_assert(): mutex (%p) not owned", lock);
2727 }
2728 } else {
2729 panic("lck_mtx_assert(): invalid arg (%u)", type);
2730 }
2731 }
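/*
 * Illustrative sketch (editorial addition): lck_mtx_assert() documents a
 * locking precondition; note the punt above when the mutex is held in spin
 * mode, since the owner is then unknown. The function below is hypothetical.
 */
#if 0   /* example only; not compiled */
static void
example_requires_mutex(lck_mtx_t *m)
{
	lck_mtx_assert(m, LCK_MTX_ASSERT_OWNED);    /* panics if not owned by this thread */
	/* ... operate on state protected by m ... */
}
#endif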
2732
2733 /*
2734 * Routine: lck_mtx_ilk_unlock
2735 */
2736 boolean_t
2737 lck_mtx_ilk_unlock(lck_mtx_t *lock)
2738 {
2739 interlock_unlock(lock);
2740 return TRUE;
2741 }
2742
2743 /*
2744 * Routine: lck_mtx_convert_spin
2745 *
2746 * Convert a mutex held for spin into a held full mutex
2747 */
2748 void
2749 lck_mtx_convert_spin(lck_mtx_t *lock)
2750 {
2751 thread_t thread = current_thread();
2752 uintptr_t state;
2753 int waiters;
2754
2755 state = ordered_load_mtx(lock);
2756 if (LCK_MTX_STATE_TO_THREAD(state) == thread) {
2757 return; // Already owned as mutex, return
2758 }
2759 if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG)) {
2760 panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
2761 }
2762 state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag
2763 ordered_store_mtx(lock, state);
2764 waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts
2765 state = LCK_MTX_THREAD_TO_STATE(thread);
2766 if (waiters != 0) {
2767 state |= ARM_LCK_WAITERS;
2768 }
2769 #if __SMP__
2770 state |= LCK_ILOCK;
2771 ordered_store_mtx(lock, state); // Set ownership
2772 interlock_unlock(lock); // Release interlock, enable preemption
2773 #else
2774 ordered_store_mtx(lock, state); // Set ownership
2775 enable_preemption();
2776 #endif
2777 turnstile_cleanup();
2778 }
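/*
 * Illustrative sketch (editorial addition): the intended pattern is to take
 * the mutex in spin mode and convert to a full mutex only when the code path
 * turns out to need to block. Names below are hypothetical.
 */
#if 0   /* example only; not compiled */
static void
example_spin_then_convert(lck_mtx_t *m, boolean_t need_to_block)
{
	lck_mtx_lock_spin(m);                   /* cheap, non-blocking hold */
	if (need_to_block) {
		lck_mtx_convert_spin(m);        /* become the full mutex owner */
		/* ... code that may block while holding m ... */
	}
	lck_mtx_unlock(m);                      /* correct for either mode */
}
#endif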
2779
2780
2781 /*
2782 * Routine: lck_mtx_destroy
2783 */
2784 void
2785 lck_mtx_destroy(
2786 lck_mtx_t * lck,
2787 lck_grp_t * grp)
2788 {
2789 if (lck->lck_mtx_type != LCK_MTX_TYPE) {
2790 panic("Destroying invalid mutex %p", lck);
2791 }
2792 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2793 panic("Destroying previously destroyed lock %p", lck);
2794 }
2795 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2796 lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
2797 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2798 lck_grp_deallocate(grp);
2799 return;
2800 }
2801
2802 /*
2803 * Routine: lck_spin_assert
2804 */
2805 void
2806 lck_spin_assert(lck_spin_t *lock, unsigned int type)
2807 {
2808 thread_t thread, holder;
2809 uintptr_t state;
2810
2811 if (lock->type != LCK_SPIN_TYPE) {
2812 panic("Invalid spinlock %p", lock);
2813 }
2814
2815 state = lock->lck_spin_data;
2816 holder = (thread_t)(state & ~LCK_ILOCK);
2817 thread = current_thread();
2818 if (type == LCK_ASSERT_OWNED) {
2819 if (holder == 0) {
2820 panic("Lock not owned %p = %lx", lock, state);
2821 }
2822 if (holder != thread) {
2823 panic("Lock not owned by current thread %p = %lx", lock, state);
2824 }
2825 if ((state & LCK_ILOCK) == 0) {
2826 panic("Lock bit not set %p = %lx", lock, state);
2827 }
2828 } else if (type == LCK_ASSERT_NOTOWNED) {
2829 if (holder != 0) {
2830 if (holder == thread) {
2831 panic("Lock owned by current thread %p = %lx", lock, state);
2832 }
2833 }
2834 } else {
2835 panic("lck_spin_assert(): invalid arg (%u)", type);
2836 }
2837 }
2838
2839 boolean_t
2840 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2841 {
2842 lck_rw_word_t word;
2843
2844 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2845
2846 word.data = ordered_load_rw(lck);
2847 if (word.want_excl || word.want_upgrade || force_yield) {
2848 lck_rw_unlock_shared(lck);
2849 mutex_pause(2);
2850 lck_rw_lock_shared(lck);
2851 return TRUE;
2852 }
2853
2854 return FALSE;
2855 }
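/*
 * Illustrative sketch (editorial addition): lck_rw_lock_yield_shared() is
 * intended for long read-side scans, letting a waiting writer in and then
 * re-taking the lock shared. Names below are hypothetical.
 */
#if 0   /* example only; not compiled */
static void
example_long_scan(lck_rw_t *rwl, const int *items, int count)
{
	lck_rw_lock_shared(rwl);
	for (int i = 0; i < count; i++) {
		/* ... examine items[i] under the shared lock ... */
		if (lck_rw_lock_yield_shared(rwl, FALSE)) {
			/* the lock was dropped and re-acquired; any state cached
			 * from the protected data may now be stale */
		}
	}
	(void) lck_rw_done(rwl);
}
#endif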
2856
2857 /*
2858 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2859 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2860 */
2861 boolean_t
2862 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2863 {
2864 uintptr_t state;
2865
2866 if (not_in_kdp) {
2867 panic("panic: spinlock acquired check done outside of kernel debugger");
2868 }
2869 state = ordered_load_mtx(lck);
2870 if (state == LCK_MTX_TAG_DESTROYED) {
2871 return FALSE;
2872 }
2873 if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK)) {
2874 return TRUE;
2875 }
2876 return FALSE;
2877 }
2878
2879 void
2880 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2881 {
2882 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
2883 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2884 uintptr_t state = ordered_load_mtx(mutex);
2885 thread_t holder = LCK_MTX_STATE_TO_THREAD(state);
2886 if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
2887 waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
2888 } else {
2889 assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
2890 assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
2891 waitinfo->owner = thread_tid(holder);
2892 }
2893 }
2894
2895 void
2896 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2897 {
2898 lck_rw_t *rwlck = NULL;
2899 switch (waitinfo->wait_type) {
2900 case kThreadWaitKernelRWLockRead:
2901 rwlck = READ_EVENT_TO_RWLOCK(event);
2902 break;
2903 case kThreadWaitKernelRWLockWrite:
2904 case kThreadWaitKernelRWLockUpgrade:
2905 rwlck = WRITE_EVENT_TO_RWLOCK(event);
2906 break;
2907 default:
2908 panic("%s was called with an invalid blocking type", __FUNCTION__);
2909 break;
2910 }
2911 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2912 waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
2913 }