apple/xnu xnu-4570.1.46: osfmk/arm/locks_arm.c
1 /*
2 * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System Copyright (c) 1991,1990,1989,1988,1987 Carnegie
33 * Mellon University All Rights Reserved.
34 *
35 * Permission to use, copy, modify and distribute this software and its
36 * documentation is hereby granted, provided that both the copyright notice
37 * and this permission notice appear in all copies of the software,
38 * derivative works or modified versions, and any portions thereof, and that
39 * both notices appear in supporting documentation.
40 *
41 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
42 * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
43 * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
44 *
45 * Carnegie Mellon requests users of this software to return to
46 *
47 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
48 * School of Computer Science Carnegie Mellon University Pittsburgh PA
49 * 15213-3890
50 *
51 * any improvements or extensions that they make and grant Carnegie Mellon the
52 * rights to redistribute these changes.
53 */
54 /*
55 * File: kern/lock.c
56 * Author: Avadis Tevanian, Jr., Michael Wayne Young
57 * Date: 1985
58 *
59 * Locking primitives implementation
60 */
61
62 #define ATOMIC_PRIVATE 1
63 #define LOCK_PRIVATE 1
64
65 #include <mach_ldebug.h>
66
67 #include <kern/kalloc.h>
68 #include <kern/locks.h>
69 #include <kern/misc_protos.h>
70 #include <kern/thread.h>
71 #include <kern/processor.h>
72 #include <kern/sched_prim.h>
73 #include <kern/xpr.h>
74 #include <kern/debug.h>
75 #include <kern/kcdata.h>
76 #include <string.h>
77
78 #include <arm/cpu_data_internal.h>
79 #include <arm/proc_reg.h>
80 #include <arm/smp.h>
81 #include <machine/atomic.h>
82 #include <machine/machine_cpu.h>
83
84 #include <sys/kdebug.h>
85
86 /*
87 * We need only enough declarations from the BSD-side to be able to
88 * test if our probe is active, and to call __dtrace_probe(). Setting
89 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
90 */
91 #if CONFIG_DTRACE
92 #define NEED_DTRACE_DEFS
93 #include <../bsd/sys/lockstat.h>
94
95 #define DTRACE_RW_SHARED 0x0 //reader
96 #define DTRACE_RW_EXCL 0x1 //writer
97 #define DTRACE_NO_FLAG 0x0 //not applicable
98
99 #endif /* CONFIG_DTRACE */
100
101 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
102 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
103 #define LCK_RW_LCK_SHARED_CODE 0x102
104 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
105 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
106 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
107
108
109 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
110
111 // Panic in tests that check lock usage correctness
112 // These are undesirable when already in a panic or when a debugger is running.
113 #define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)
114
115 unsigned int LcksOpts = 0;
116
117 #if CONFIG_DTRACE && __SMP__
118 extern uint64_t dtrace_spin_threshold;
119 #endif
120
121 /* Forwards */
122
123
124 #if USLOCK_DEBUG
125 /*
126 * Perform simple lock checks.
127 */
128 int uslock_check = 1;
129 int max_lock_loops = 100000000;
130 decl_simple_lock_data(extern, printf_lock)
131 decl_simple_lock_data(extern, panic_lock)
132 #endif /* USLOCK_DEBUG */
133
134 extern unsigned int not_in_kdp;
135
136 /*
137 * We often want to know the addresses of the callers
138 * of the various lock routines. However, this information
139 * is only used for debugging and statistics.
140 */
141 typedef void *pc_t;
142 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
143 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
144
145 #ifdef lint
146 /*
147 * Eliminate lint complaints about unused local pc variables.
148 */
149 #define OBTAIN_PC(pc,l) ++pc
150 #else /* lint */
151 #define OBTAIN_PC(pc,l)
152 #endif /* lint */
153
154
155 /*
156 * Portable lock package implementation of usimple_locks.
157 */
158
159 #if USLOCK_DEBUG
160 #define USLDBG(stmt) stmt
161 void usld_lock_init(usimple_lock_t, unsigned short);
162 void usld_lock_pre(usimple_lock_t, pc_t);
163 void usld_lock_post(usimple_lock_t, pc_t);
164 void usld_unlock(usimple_lock_t, pc_t);
165 void usld_lock_try_pre(usimple_lock_t, pc_t);
166 void usld_lock_try_post(usimple_lock_t, pc_t);
167 int usld_lock_common_checks(usimple_lock_t, const char *);
168 #else /* USLOCK_DEBUG */
169 #define USLDBG(stmt)
170 #endif /* USLOCK_DEBUG */
171
172 /*
173 * Owner thread pointer when lock held in spin mode
174 */
175 #define LCK_MTX_SPIN_TAG 0xfffffff0
176
177
178 #define interlock_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
179 #define interlock_try(lock) hw_lock_bit_try((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
180 #define interlock_unlock(lock) hw_unlock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
181 #define lck_rw_ilk_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
182 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
183
184 #define memory_barrier() __c11_atomic_thread_fence(memory_order_acq_rel_smp)
185 #define load_memory_barrier() __c11_atomic_thread_fence(memory_order_acquire_smp)
186 #define store_memory_barrier() __c11_atomic_thread_fence(memory_order_release_smp)
187
188 // Enforce program order of loads and stores.
189 #define ordered_load(target, type) \
190 __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed)
191 #define ordered_store(target, type, value) \
192 __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed)
193
194 #define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data, uintptr_t)
195 #define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, uintptr_t, (value))
196 #define ordered_load_rw(lock) ordered_load(&(lock)->lck_rw_data, uint32_t)
197 #define ordered_store_rw(lock, value) ordered_store(&(lock)->lck_rw_data, uint32_t, (value))
198 #define ordered_load_rw_owner(lock) ordered_load(&(lock)->lck_rw_owner, thread_t)
199 #define ordered_store_rw_owner(lock, value) ordered_store(&(lock)->lck_rw_owner, thread_t, (value))
200 #define ordered_load_hw(lock) ordered_load(&(lock)->lock_data, uintptr_t)
201 #define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, uintptr_t, (value))
202 #define ordered_load_bit(lock) ordered_load((lock), uint32_t)
203 #define ordered_store_bit(lock, value) ordered_store((lock), uint32_t, (value))
204
205
206 // Prevent the compiler from reordering memory operations around this
207 #define compiler_memory_fence() __asm__ volatile ("" ::: "memory")
208
209 #define LOCK_PANIC_TIMEOUT 0xc00000
210 #define NOINLINE __attribute__((noinline))
211
212
213 #if __arm__
214 #define interrupts_disabled(mask) (mask & PSR_INTMASK)
215 #else
216 #define interrupts_disabled(mask) (mask & DAIF_IRQF)
217 #endif
218
219
220 #if __arm__
221 #define enable_fiq() __asm__ volatile ("cpsie f" ::: "memory");
222 #define enable_interrupts() __asm__ volatile ("cpsie if" ::: "memory");
223 #endif
224
225 /*
226 * Forward declarations
227 */
228
229 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
230 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
231 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
232 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
233 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
234 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
235 void lck_rw_clear_promotions_x86(thread_t thread);
236 static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
237
238 /*
239 * atomic exchange API is a low level abstraction of the operations
240 * to atomically read, modify, and write a pointer. This abstraction works
241 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
242 * well as the ARM exclusive instructions.
243 *
244 * atomic_exchange_begin() - begin exchange and retrieve current value
245 * atomic_exchange_complete() - conclude an exchange
246 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
247 */
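/*
 * Illustrative read-modify-write loop using these primitives (sketch only,
 * not part of the build; SOME_BIT is a placeholder for one of the LCK_RW_*
 * bits manipulated by the real callers below):
 *
 *     uint32_t data, prev;
 *     for ( ; ; ) {
 *         data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
 *         data |= SOME_BIT;        // compute the desired new value
 *         if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp))
 *             break;               // exchange landed
 *         cpu_pause();             // reservation lost, retry
 *     }
 *     // call atomic_exchange_abort() instead of *_complete32() to back out
 */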
248 static uint32_t
249 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
250 {
251 uint32_t val;
252
253 val = load_exclusive32(target, ord);
254 *previous = val;
255 return val;
256 }
257
258 static boolean_t
259 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
260 {
261 (void)previous; // Previous not needed, monitor is held
262 return store_exclusive32(target, newval, ord);
263 }
264
265 static void
266 atomic_exchange_abort(void)
267 {
268 clear_exclusive();
269 }
270
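/*
 * atomic_test_and_set32: atomically OR set_mask into *target, but only if
 * none of the bits in test_mask are currently set; returns TRUE on success.
 * With wait == TRUE, a failed attempt first parks in wait_for_event() with
 * the exclusive monitor held (so a store to the target wakes it) before
 * returning FALSE.
 */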
271 static boolean_t
272 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
273 {
274 uint32_t value, prev;
275
276 for ( ; ; ) {
277 value = atomic_exchange_begin32(target, &prev, ord);
278 if (value & test_mask) {
279 if (wait)
280 wait_for_event(); // Wait with monitor held
281 else
282 atomic_exchange_abort(); // Clear exclusive monitor
283 return FALSE;
284 }
285 value |= set_mask;
286 if (atomic_exchange_complete32(target, prev, value, ord))
287 return TRUE;
288 }
289 }
290
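/*
 * _disable_preemption: increment the current thread's preemption disable
 * count.  Only the owning thread updates this per-thread count, so a
 * relaxed ordered_store is sufficient.
 */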
291 void _disable_preemption(void)
292 {
293 thread_t thread = current_thread();
294 unsigned int count;
295
296 count = thread->machine.preemption_count + 1;
297 ordered_store(&thread->machine.preemption_count, unsigned int, count);
298 }
299
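/*
 * _enable_preemption: decrement the preemption disable count.  When the
 * count reaches zero and interrupts are not already masked, interrupts are
 * briefly disabled while the count is dropped to zero, and any pending
 * urgent AST is handled via ast_taken_kernel().
 */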
300 void _enable_preemption(void)
301 {
302 thread_t thread = current_thread();
303 long state;
304 unsigned int count;
305 #if __arm__
306 #define INTERRUPT_MASK PSR_IRQF
307 #else // __arm__
308 #define INTERRUPT_MASK DAIF_IRQF
309 #endif // __arm__
310
311 count = thread->machine.preemption_count;
312 if (count == 0)
313 panic("Preemption count negative"); // Count will go negative when released
314 count--;
315 if (count > 0)
316 goto update_count; // Preemption is still disabled, just update
317 state = get_interrupts(); // Get interrupt state
318 if (state & INTERRUPT_MASK)
319 goto update_count; // Interrupts are already masked, can't take AST here
320
321 disable_interrupts_noread(); // Disable interrupts
322 ordered_store(&thread->machine.preemption_count, unsigned int, count);
323 if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
324 #if __arm__
325 #if __ARM_USER_PROTECT__
326 uintptr_t up = arm_user_protect_begin(thread);
327 #endif // __ARM_USER_PROTECT__
328 enable_fiq();
329 #endif // __arm__
330 ast_taken_kernel(); // Handle urgent AST
331 #if __arm__
332 #if __ARM_USER_PROTECT__
333 arm_user_protect_end(thread, up, TRUE);
334 #endif // __ARM_USER_PROTECT__
335 enable_interrupts();
336 return; // Return early on arm only due to FIQ enabling
337 #endif // __arm__
338 }
339 restore_interrupts(state); // Enable interrupts
340 return;
341
342 update_count:
343 ordered_store(&thread->machine.preemption_count, unsigned int, count);
344 return;
345 }
346
347 int get_preemption_level(void)
348 {
349 return current_thread()->machine.preemption_count;
350 }
351
352 /* Forward declarations for unexported functions that are used externally */
353 void hw_lock_bit(hw_lock_bit_t *lock, unsigned int bit);
354 void hw_unlock_bit(hw_lock_bit_t *lock, unsigned int bit);
355
356 #if __SMP__
357 static unsigned int
358 hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout);
359 #endif
360
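/*
 * hw_lock_bit_to: acquire bit 'bit' of the lock word as a spin lock,
 * spinning for at most 'timeout' timebase ticks on SMP.  Returns 1 on
 * success and 0 on timeout; preemption is disabled in either case.
 */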
361 unsigned int
362 hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout)
363 {
364 unsigned int success = 0;
365 uint32_t mask = (1 << bit);
366 #if !__SMP__
367 uint32_t state;
368 #endif
369
370 _disable_preemption();
371 #if __SMP__
372 if (__improbable(!atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE)))
373 success = hw_lock_bit_to_contended(lock, mask, timeout);
374 else
375 success = 1;
376 #else // __SMP__
377 (void)timeout;
378 state = ordered_load_bit(lock);
379 if (!(mask & state)) {
380 ordered_store_bit(lock, state | mask);
381 success = 1;
382 }
383 #endif // __SMP__
384
385 #if CONFIG_DTRACE
386 if (success)
387 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, bit);
388 #endif
389
390 return success;
391 }
392
393 #if __SMP__
394 static unsigned int NOINLINE
395 hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout)
396 {
397 uint64_t end = 0;
398 int i;
399 #if CONFIG_DTRACE
400 uint64_t begin;
401 boolean_t dtrace_enabled = lockstat_probemap[LS_LCK_SPIN_LOCK_SPIN] != 0;
402 if (__improbable(dtrace_enabled))
403 begin = mach_absolute_time();
404 #endif
405 for ( ; ; ) {
406 for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
407 // Always load-exclusive before wfe
408 // This grabs the monitor and wakes up on a release event
409 if (atomic_test_and_set32(lock, mask, mask, memory_order_acquire, TRUE)) {
410 goto end;
411 }
412 }
413 if (end == 0)
414 end = ml_get_timebase() + timeout;
415 else if (ml_get_timebase() >= end)
416 break;
417 }
418 return 0;
419 end:
420 #if CONFIG_DTRACE
421 if (__improbable(dtrace_enabled)) {
422 uint64_t spintime = mach_absolute_time() - begin;
423 if (spintime > dtrace_spin_threshold)
424 LOCKSTAT_RECORD2(LS_LCK_SPIN_LOCK_SPIN, lock, spintime, mask);
425 }
426 #endif
427 return 1;
428 }
429 #endif // __SMP__
430
431 void
432 hw_lock_bit(hw_lock_bit_t *lock, unsigned int bit)
433 {
434 if (hw_lock_bit_to(lock, bit, LOCK_PANIC_TIMEOUT))
435 return;
436 #if __SMP__
437 panic("hw_lock_bit(): timed out (%p)", lock);
438 #else
439 panic("hw_lock_bit(): interlock held (%p)", lock);
440 #endif
441 }
442
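/*
 * hw_lock_bit_try: make a single attempt to take the bit lock.  Interrupts
 * are masked across the attempt so the acquire and the preemption-disable
 * happen atomically with respect to interrupts.  Returns nonzero with
 * preemption disabled on success, zero otherwise.
 */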
443 unsigned int
444 hw_lock_bit_try(hw_lock_bit_t *lock, unsigned int bit)
445 {
446 long intmask;
447 uint32_t mask = (1 << bit);
448 #if !__SMP__
449 uint32_t state;
450 #endif
451 boolean_t success = FALSE;
452
453 intmask = disable_interrupts();
454 #if __SMP__
455 // TODO: consider weak (non-looping) atomic test-and-set
456 success = atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE);
457 #else
458 state = ordered_load_bit(lock);
459 if (!(mask & state)) {
460 ordered_store_bit(lock, state | mask);
461 success = TRUE;
462 }
463 #endif // __SMP__
464 if (success)
465 disable_preemption();
466 restore_interrupts(intmask);
467
468 #if CONFIG_DTRACE
469 if (success)
470 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, bit);
471 #endif
472
473 return success;
474 }
475
476 /*
477 * Routine: hw_unlock_bit
478 *
479 * Release the bit spin-lock. The second parameter is the bit number to clear.
480 * Decrements the preemption level.
481 */
482 void
483 hw_unlock_bit(hw_lock_bit_t *lock, unsigned int bit)
484 {
485 uint32_t mask = (1 << bit);
486 #if !__SMP__
487 uint32_t state;
488 #endif
489
490 #if __SMP__
491 __c11_atomic_fetch_and((_Atomic uint32_t *)lock, ~mask, memory_order_release);
492 set_event();
493 #else // __SMP__
494 state = ordered_load_bit(lock);
495 ordered_store_bit(lock, state & ~mask);
496 #endif // __SMP__
497 #if CONFIG_DTRACE
498 LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, bit);
499 #endif
500 enable_preemption();
501 }
502
503
504 /*
505 * Routine: lck_spin_alloc_init
506 */
507 lck_spin_t *
508 lck_spin_alloc_init(
509 lck_grp_t * grp,
510 lck_attr_t * attr)
511 {
512 lck_spin_t *lck;
513
514 if ((lck = (lck_spin_t *) kalloc(sizeof(lck_spin_t))) != 0)
515 lck_spin_init(lck, grp, attr);
516
517 return (lck);
518 }
519
520 /*
521 * Routine: lck_spin_free
522 */
523 void
524 lck_spin_free(
525 lck_spin_t * lck,
526 lck_grp_t * grp)
527 {
528 lck_spin_destroy(lck, grp);
529 kfree((void *) lck, sizeof(lck_spin_t));
530 }
531
532 /*
533 * Routine: lck_spin_init
534 */
535 void
536 lck_spin_init(
537 lck_spin_t * lck,
538 lck_grp_t * grp,
539 __unused lck_attr_t * attr)
540 {
541 hw_lock_init(&lck->hwlock);
542 lck->type = LCK_SPIN_TYPE;
543 lck_grp_reference(grp);
544 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
545 store_memory_barrier();
546 }
547
548 /*
549 * arm_usimple_lock is a lck_spin_t without a group or attributes
550 */
551 void inline
552 arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
553 {
554 lck->type = LCK_SPIN_TYPE;
555 hw_lock_init(&lck->hwlock);
556 store_memory_barrier();
557 }
558
559
560 /*
561 * Routine: lck_spin_lock
562 */
563 void
564 lck_spin_lock(lck_spin_t *lock)
565 {
566 #if DEVELOPMENT || DEBUG
567 if (lock->type != LCK_SPIN_TYPE)
568 panic("Invalid spinlock %p", lock);
569 #endif // DEVELOPMENT || DEBUG
570 hw_lock_lock(&lock->hwlock);
571 }
572
573 /*
574 * Routine: lck_spin_try_lock
575 */
576 int
577 lck_spin_try_lock(lck_spin_t *lock)
578 {
579 return hw_lock_try(&lock->hwlock);
580 }
581
582 /*
583 * Routine: lck_spin_unlock
584 */
585 void
586 lck_spin_unlock(lck_spin_t *lock)
587 {
588 #if DEVELOPMENT || DEBUG
589 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC())
590 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
591 if (lock->type != LCK_SPIN_TYPE)
592 panic("Invalid spinlock type %p", lock);
593 #endif // DEVELOPMENT || DEBUG
594 hw_lock_unlock(&lock->hwlock);
595 }
596
597 /*
598 * Routine: lck_spin_destroy
599 */
600 void
601 lck_spin_destroy(
602 lck_spin_t * lck,
603 lck_grp_t * grp)
604 {
605 if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED)
606 return;
607 lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
608 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
609 lck_grp_deallocate(grp);
610 }
611
612 /*
613 * Routine: kdp_lck_spin_is_acquired
614 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
615 */
616 boolean_t
617 kdp_lck_spin_is_acquired(lck_spin_t *lck) {
618 if (not_in_kdp) {
619 panic("panic: spinlock acquired check done outside of kernel debugger");
620 }
621 return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE:FALSE;
622 }
623
624 /*
625 * Initialize a usimple_lock.
626 *
627 * No change in preemption state.
628 */
629 void
630 usimple_lock_init(
631 usimple_lock_t l,
632 unsigned short tag)
633 {
634 #ifndef MACHINE_SIMPLE_LOCK
635 USLDBG(usld_lock_init(l, tag));
636 hw_lock_init(&l->lck_spin_data);
637 #else
638 simple_lock_init((simple_lock_t) l, tag);
639 #endif
640 }
641
642
643 /*
644 * Acquire a usimple_lock.
645 *
646 * Returns with preemption disabled. Note
647 * that the hw_lock routines are responsible for
648 * maintaining preemption state.
649 */
650 void
651 usimple_lock(
652 usimple_lock_t l)
653 {
654 #ifndef MACHINE_SIMPLE_LOCK
655 pc_t pc;
656
657 OBTAIN_PC(pc, l);
658 USLDBG(usld_lock_pre(l, pc));
659
660 if (!hw_lock_to(&l->lck_spin_data, LockTimeOut)) /* Try to get the lock
661 * with a timeout */
662 panic("simple lock deadlock detection - l=%p, cpu=%d, ret=%p", &l, cpu_number(), pc);
663
664 USLDBG(usld_lock_post(l, pc));
665 #else
666 simple_lock((simple_lock_t) l);
667 #endif
668 }
669
670
671 extern void sync(void);
672
673 /*
674 * Release a usimple_lock.
675 *
676 * Returns with preemption enabled. Note
677 * that the hw_lock routines are responsible for
678 * maintaining preemption state.
679 */
680 void
681 usimple_unlock(
682 usimple_lock_t l)
683 {
684 #ifndef MACHINE_SIMPLE_LOCK
685 pc_t pc;
686
687 OBTAIN_PC(pc, l);
688 USLDBG(usld_unlock(l, pc));
689 sync();
690 hw_lock_unlock(&l->lck_spin_data);
691 #else
692 simple_unlock((simple_lock_t) l);
693 #endif
694 }
695
696
697 /*
698 * Conditionally acquire a usimple_lock.
699 *
700 * On success, returns with preemption disabled.
701 * On failure, returns with preemption in the same state
702 * as when first invoked. Note that the hw_lock routines
703 * are responsible for maintaining preemption state.
704 *
705 * XXX No stats are gathered on a miss; I preserved this
706 * behavior from the original assembly-language code, but
707 * doesn't it make sense to log misses? XXX
708 */
709 unsigned int
710 usimple_lock_try(
711 usimple_lock_t l)
712 {
713 #ifndef MACHINE_SIMPLE_LOCK
714 pc_t pc;
715 unsigned int success;
716
717 OBTAIN_PC(pc, l);
718 USLDBG(usld_lock_try_pre(l, pc));
719 if ((success = hw_lock_try(&l->lck_spin_data))) {
720 USLDBG(usld_lock_try_post(l, pc));
721 }
722 return success;
723 #else
724 return (simple_lock_try((simple_lock_t) l));
725 #endif
726 }
727
728 #if USLOCK_DEBUG
729 /*
730 * States of a usimple_lock. The default when initializing
731 * a usimple_lock is setting it up for debug checking.
732 */
733 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
734 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
735 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
736 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
737 #define USLOCK_CHECKING(l) (uslock_check && \
738 ((l)->debug.state & USLOCK_CHECKED))
739
740 /*
741 * Trace activities of a particularly interesting lock.
742 */
743 void usl_trace(usimple_lock_t, int, pc_t, const char *);
744
745
746 /*
747 * Initialize the debugging information contained
748 * in a usimple_lock.
749 */
750 void
751 usld_lock_init(
752 usimple_lock_t l,
753 __unused unsigned short tag)
754 {
755 if (l == USIMPLE_LOCK_NULL)
756 panic("lock initialization: null lock pointer");
757 l->lock_type = USLOCK_TAG;
758 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
759 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
760 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
761 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
762 l->debug.duration[0] = l->debug.duration[1] = 0;
763 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
764 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
765 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
766 }
767
768
769 /*
770 * These checks apply to all usimple_locks, not just
771 * those with USLOCK_CHECKED turned on.
772 */
773 int
774 usld_lock_common_checks(
775 usimple_lock_t l,
776 const char *caller)
777 {
778 if (l == USIMPLE_LOCK_NULL)
779 panic("%s: null lock pointer", caller);
780 if (l->lock_type != USLOCK_TAG)
781 panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l);
782 if (!(l->debug.state & USLOCK_INIT))
783 panic("%s: 0x%x is not an initialized lock",
784 caller, (integer_t) l);
785 return USLOCK_CHECKING(l);
786 }
787
788
789 /*
790 * Debug checks on a usimple_lock just before attempting
791 * to acquire it.
792 */
793 /* ARGSUSED */
794 void
795 usld_lock_pre(
796 usimple_lock_t l,
797 pc_t pc)
798 {
799 const char *caller = "usimple_lock";
800
801
802 if (!usld_lock_common_checks(l, caller))
803 return;
804
805 /*
806 * Note that we have a weird case where we are getting a lock when we are
807 * in the process of putting the system to sleep. We are running with no
808 * current threads, therefore we can't tell if we are trying to retake a lock
809 * we have or someone on the other processor has it. Therefore we just
810 * ignore this test if the locking thread is 0.
811 */
812
813 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
814 l->debug.lock_thread == (void *) current_thread()) {
815 printf("%s: lock 0x%x already locked (at %p) by",
816 caller, (integer_t) l, l->debug.lock_pc);
817 printf(" current thread %p (new attempt at pc %p)\n",
818 l->debug.lock_thread, pc);
819 panic("%s", caller);
820 }
821 mp_disable_preemption();
822 usl_trace(l, cpu_number(), pc, caller);
823 mp_enable_preemption();
824 }
825
826
827 /*
828 * Debug checks on a usimple_lock just after acquiring it.
829 *
830 * Pre-emption has been disabled at this point,
831 * so we are safe in using cpu_number.
832 */
833 void
834 usld_lock_post(
835 usimple_lock_t l,
836 pc_t pc)
837 {
838 int mycpu;
839 const char *caller = "successful usimple_lock";
840
841
842 if (!usld_lock_common_checks(l, caller))
843 return;
844
845 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
846 panic("%s: lock 0x%x became uninitialized",
847 caller, (integer_t) l);
848 if ((l->debug.state & USLOCK_TAKEN))
849 panic("%s: lock 0x%x became TAKEN by someone else",
850 caller, (integer_t) l);
851
852 mycpu = cpu_number();
853 l->debug.lock_thread = (void *) current_thread();
854 l->debug.state |= USLOCK_TAKEN;
855 l->debug.lock_pc = pc;
856 l->debug.lock_cpu = mycpu;
857
858 usl_trace(l, mycpu, pc, caller);
859 }
860
861
862 /*
863 * Debug checks on a usimple_lock just before
864 * releasing it. Note that the caller has not
865 * yet released the hardware lock.
866 *
867 * Preemption is still disabled, so there's
868 * no problem using cpu_number.
869 */
870 void
871 usld_unlock(
872 usimple_lock_t l,
873 pc_t pc)
874 {
875 int mycpu;
876 const char *caller = "usimple_unlock";
877
878
879 if (!usld_lock_common_checks(l, caller))
880 return;
881
882 mycpu = cpu_number();
883
884 if (!(l->debug.state & USLOCK_TAKEN))
885 panic("%s: lock 0x%x hasn't been taken",
886 caller, (integer_t) l);
887 if (l->debug.lock_thread != (void *) current_thread())
888 panic("%s: unlocking lock 0x%x, owned by thread %p",
889 caller, (integer_t) l, l->debug.lock_thread);
890 if (l->debug.lock_cpu != mycpu) {
891 printf("%s: unlocking lock 0x%x on cpu 0x%x",
892 caller, (integer_t) l, mycpu);
893 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
894 panic("%s", caller);
895 }
896 usl_trace(l, mycpu, pc, caller);
897
898 l->debug.unlock_thread = l->debug.lock_thread;
899 l->debug.lock_thread = INVALID_THREAD;
900 l->debug.state &= ~USLOCK_TAKEN;
901 l->debug.unlock_pc = pc;
902 l->debug.unlock_cpu = mycpu;
903 }
904
905
906 /*
907 * Debug checks on a usimple_lock just before
908 * attempting to acquire it.
909 *
910 * Preemption isn't guaranteed to be disabled.
911 */
912 void
913 usld_lock_try_pre(
914 usimple_lock_t l,
915 pc_t pc)
916 {
917 const char *caller = "usimple_lock_try";
918
919 if (!usld_lock_common_checks(l, caller))
920 return;
921 mp_disable_preemption();
922 usl_trace(l, cpu_number(), pc, caller);
923 mp_enable_preemption();
924 }
925
926
927 /*
928 * Debug checks on a usimple_lock just after
929 * successfully attempting to acquire it.
930 *
931 * Preemption has been disabled by the
932 * lock acquisition attempt, so it's safe
933 * to use cpu_number.
934 */
935 void
936 usld_lock_try_post(
937 usimple_lock_t l,
938 pc_t pc)
939 {
940 int mycpu;
941 const char *caller = "successful usimple_lock_try";
942
943 if (!usld_lock_common_checks(l, caller))
944 return;
945
946 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
947 panic("%s: lock 0x%x became uninitialized",
948 caller, (integer_t) l);
949 if ((l->debug.state & USLOCK_TAKEN))
950 panic("%s: lock 0x%x became TAKEN by someone else",
951 caller, (integer_t) l);
952
953 mycpu = cpu_number();
954 l->debug.lock_thread = (void *) current_thread();
955 l->debug.state |= USLOCK_TAKEN;
956 l->debug.lock_pc = pc;
957 l->debug.lock_cpu = mycpu;
958
959 usl_trace(l, mycpu, pc, caller);
960 }
961
962
963 /*
964 * For very special cases, set traced_lock to point to a
965 * specific lock of interest. The result is a series of
966 * XPRs showing lock operations on that lock. The lock_seq
967 * value is used to show the order of those operations.
968 */
969 usimple_lock_t traced_lock;
970 unsigned int lock_seq;
971
972 void
973 usl_trace(
974 usimple_lock_t l,
975 int mycpu,
976 pc_t pc,
977 const char *op_name)
978 {
979 if (traced_lock == l) {
980 XPR(XPR_SLOCK,
981 "seq %d, cpu %d, %s @ %x\n",
982 (integer_t) lock_seq, (integer_t) mycpu,
983 (integer_t) op_name, (integer_t) pc, 0);
984 lock_seq++;
985 }
986 }
987
988
989 #endif /* USLOCK_DEBUG */
990
991 /*
992 * The C portion of the shared/exclusive locks package.
993 */
994
995 /*
996 * compute the deadline to spin against when
997 * waiting for a change of state on a lck_rw_t
998 */
999 #if __SMP__
1000 static inline uint64_t
1001 lck_rw_deadline_for_spin(lck_rw_t *lck)
1002 {
1003 lck_rw_word_t word;
1004
1005 word.data = ordered_load_rw(lck);
1006 if (word.can_sleep) {
1007 if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
1008 /*
1009 * there are already threads waiting on this lock... this
1010 * implies that they have spun beyond their deadlines waiting for
1011 * the desired state to show up so we will not bother spinning at this time...
1012 * or
1013 * the current number of threads sharing this lock exceeds our capacity to run them
1014 * concurrently and since all states we're going to spin for require the rw_shared_count
1015 * to be at 0, we'll not bother spinning since the latency for this to happen is
1016 * unpredictable...
1017 */
1018 return (mach_absolute_time());
1019 }
1020 return (mach_absolute_time() + MutexSpin);
1021 } else
1022 return (mach_absolute_time() + (100000LL * 1000000000LL));
1023 }
1024 #endif // __SMP__
1025
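/*
 * lck_rw_drain_status: return TRUE once none of the bits in status_mask
 * remain set in the lock word.  With wait == TRUE (SMP only), spin in
 * wait_for_event() until the bits drain or the deadline from
 * lck_rw_deadline_for_spin() expires; with wait == FALSE, just sample the
 * current state once.
 */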
1026 static boolean_t
1027 lck_rw_drain_status(lck_rw_t *lock, uint32_t status_mask, boolean_t wait __unused)
1028 {
1029 #if __SMP__
1030 uint64_t deadline = 0;
1031 uint32_t data;
1032
1033 if (wait)
1034 deadline = lck_rw_deadline_for_spin(lock);
1035
1036 for ( ; ; ) {
1037 data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
1038 if ((data & status_mask) == 0)
1039 break;
1040 if (wait)
1041 wait_for_event();
1042 else
1043 clear_exclusive();
1044 if (!wait || (mach_absolute_time() >= deadline))
1045 return FALSE;
1046 }
1047 clear_exclusive();
1048 return TRUE;
1049 #else
1050 uint32_t data;
1051
1052 data = ordered_load_rw(lock);
1053 if ((data & status_mask) == 0)
1054 return TRUE;
1055 else
1056 return FALSE;
1057 #endif // __SMP__
1058 }
1059
1060 /*
1061 * Spin while interlock is held.
1062 */
1063 static inline void
1064 lck_rw_interlock_spin(lck_rw_t *lock)
1065 {
1066 #if __SMP__
1067 uint32_t data;
1068
1069 for ( ; ; ) {
1070 data = load_exclusive32(&lock->lck_rw_data, memory_order_relaxed);
1071 if (data & LCK_RW_INTERLOCK)
1072 wait_for_event();
1073 else {
1074 clear_exclusive();
1075 return;
1076 }
1077 }
1078 #else
1079 panic("lck_rw_interlock_spin(): Interlock locked %p %x", lock, lock->lck_rw_data);
1080 #endif
1081 }
1082
1083 /*
1084 * We disable interrupts while holding the RW interlock to prevent an
1085 * interrupt from exacerbating hold time.
1086 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
1087 */
1088 static inline boolean_t
1089 lck_interlock_lock(lck_rw_t *lck)
1090 {
1091 boolean_t istate;
1092
1093 istate = ml_set_interrupts_enabled(FALSE);
1094 lck_rw_ilk_lock(lck);
1095 return istate;
1096 }
1097
1098 static inline void
1099 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
1100 {
1101 lck_rw_ilk_unlock(lck);
1102 ml_set_interrupts_enabled(istate);
1103 }
1104
1105
1106 #define LCK_RW_GRAB_WANT 0
1107 #define LCK_RW_GRAB_SHARED 1
1108
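/*
 * lck_rw_grab: try to transition the lock word, either by setting
 * LCK_RW_WANT_EXCL (mode LCK_RW_GRAB_WANT) or by adding a shared reader
 * reference (mode LCK_RW_GRAB_SHARED), using the exclusive-monitor
 * exchange loop above.  With wait == TRUE the attempt keeps retrying,
 * parking in wait_for_event(), until the lck_rw_deadline_for_spin()
 * deadline expires.  Returns TRUE if the transition succeeded.
 */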
1109 static boolean_t
1110 lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait)
1111 {
1112 uint64_t deadline = 0;
1113 uint32_t data, prev;
1114 boolean_t do_exch;
1115
1116 #if __SMP__
1117 if (wait)
1118 deadline = lck_rw_deadline_for_spin(lock);
1119 #else
1120 wait = FALSE; // Don't spin on UP systems
1121 #endif
1122
1123 for ( ; ; ) {
1124 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1125 if (data & LCK_RW_INTERLOCK) {
1126 atomic_exchange_abort();
1127 lck_rw_interlock_spin(lock);
1128 continue;
1129 }
1130 do_exch = FALSE;
1131 if (mode == LCK_RW_GRAB_WANT) {
1132 if ((data & LCK_RW_WANT_EXCL) == 0) {
1133 data |= LCK_RW_WANT_EXCL;
1134 do_exch = TRUE;
1135 }
1136 } else { // LCK_RW_GRAB_SHARED
1137 if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
1138 (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
1139 data += LCK_RW_SHARED_READER;
1140 do_exch = TRUE;
1141 }
1142 }
1143 if (do_exch) {
1144 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp))
1145 return TRUE;
1146 } else {
1147 if (wait) // Wait with monitor held; a store to the lock word wakes us
1148 wait_for_event();
1149 else
1150 atomic_exchange_abort();
1151 if (!wait || (mach_absolute_time() >= deadline))
1152 return FALSE;
1153 }
1154 }
1155 }
1156
1157
1158 /*
1159 * Routine: lck_rw_alloc_init
1160 */
1161 lck_rw_t *
1162 lck_rw_alloc_init(
1163 lck_grp_t *grp,
1164 lck_attr_t *attr)
1165 {
1166 lck_rw_t *lck;
1167
1168 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0)
1169 lck_rw_init(lck, grp, attr);
1170
1171 return lck;
1172 }
1173
1174 /*
1175 * Routine: lck_rw_free
1176 */
1177 void
1178 lck_rw_free(
1179 lck_rw_t *lck,
1180 lck_grp_t *grp)
1181 {
1182 lck_rw_destroy(lck, grp);
1183 kfree(lck, sizeof(lck_rw_t));
1184 }
1185
1186 /*
1187 * Routine: lck_rw_init
1188 */
1189 void
1190 lck_rw_init(
1191 lck_rw_t *lck,
1192 lck_grp_t *grp,
1193 lck_attr_t *attr)
1194 {
1195 if (attr == LCK_ATTR_NULL)
1196 attr = &LockDefaultLckAttr;
1197 memset(lck, 0, sizeof(lck_rw_t));
1198 lck->lck_rw_can_sleep = TRUE;
1199 if ((attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY) == 0)
1200 lck->lck_rw_priv_excl = TRUE;
1201
1202 lck_grp_reference(grp);
1203 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
1204 }
1205
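/*
 * Illustrative lifetime of a read/write lock built on the routines in this
 * file (sketch only; 'my_grp' stands for a lck_grp_t created elsewhere):
 *
 *     lck_rw_t *lck = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *     lck_rw_lock_shared(lck);       // many concurrent readers
 *     ...
 *     lck_rw_unlock_shared(lck);
 *
 *     lck_rw_lock_exclusive(lck);    // single writer
 *     ...
 *     lck_rw_unlock_exclusive(lck);
 *
 *     lck_rw_free(lck, my_grp);
 */
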
1206
1207 /*
1208 * Routine: lck_rw_destroy
1209 */
1210 void
1211 lck_rw_destroy(
1212 lck_rw_t *lck,
1213 lck_grp_t *grp)
1214 {
1215 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
1216 return;
1217 #if MACH_LDEBUG
1218 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
1219 #endif
1220 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
1221 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
1222 lck_grp_deallocate(grp);
1223 return;
1224 }
1225
1226 /*
1227 * Routine: lck_rw_lock
1228 */
1229 void
1230 lck_rw_lock(
1231 lck_rw_t *lck,
1232 lck_rw_type_t lck_rw_type)
1233 {
1234 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1235 lck_rw_lock_shared(lck);
1236 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1237 lck_rw_lock_exclusive(lck);
1238 else
1239 panic("lck_rw_lock(): Invalid RW lock type: %x", lck_rw_type);
1240 }
1241
1242 /*
1243 * Routine: lck_rw_lock_exclusive
1244 */
1245 void
1246 lck_rw_lock_exclusive(lck_rw_t *lock)
1247 {
1248 thread_t thread = current_thread();
1249
1250 thread->rwlock_count++;
1251 if (atomic_test_and_set32(&lock->lck_rw_data,
1252 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1253 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1254 #if CONFIG_DTRACE
1255 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1256 #endif /* CONFIG_DTRACE */
1257 } else
1258 lck_rw_lock_exclusive_gen(lock);
1259 #if MACH_ASSERT
1260 thread_t owner = ordered_load_rw_owner(lock);
1261 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1262 #endif
1263 ordered_store_rw_owner(lock, thread);
1264 }
1265
1266 /*
1267 * Routine: lck_rw_lock_shared
1268 */
1269 void
1270 lck_rw_lock_shared(lck_rw_t *lock)
1271 {
1272 uint32_t data, prev;
1273
1274 current_thread()->rwlock_count++;
1275 for ( ; ; ) {
1276 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1277 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1278 atomic_exchange_abort();
1279 lck_rw_lock_shared_gen(lock);
1280 break;
1281 }
1282 data += LCK_RW_SHARED_READER;
1283 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp))
1284 break;
1285 cpu_pause();
1286 }
1287 #if MACH_ASSERT
1288 thread_t owner = ordered_load_rw_owner(lock);
1289 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1290 #endif
1291 #if CONFIG_DTRACE
1292 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1293 #endif /* CONFIG_DTRACE */
1294 return;
1295 }
1296
1297 /*
1298 * Routine: lck_rw_lock_shared_to_exclusive
1299 */
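/*
 * The caller must hold a shared (read) reference.  If another thread
 * already owns LCK_RW_WANT_UPGRADE, the read reference is dropped and
 * FALSE is returned (the lock is then not held in any mode); otherwise
 * the read reference is traded for LCK_RW_WANT_UPGRADE and we wait for
 * the remaining readers to drain before returning TRUE as the exclusive
 * holder.
 */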
1300 boolean_t
1301 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1302 {
1303 uint32_t data, prev;
1304
1305 for ( ; ; ) {
1306 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1307 if (data & LCK_RW_INTERLOCK) {
1308 atomic_exchange_abort();
1309 lck_rw_interlock_spin(lock);
1310 continue;
1311 }
1312 if (data & LCK_RW_WANT_UPGRADE) {
1313 data -= LCK_RW_SHARED_READER;
1314 if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */
1315 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1316 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp))
1317 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1318 } else {
1319 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1320 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1321 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp))
1322 break;
1323 }
1324 cpu_pause();
1325 }
1326 /* we now own the WANT_UPGRADE */
1327 if (data & LCK_RW_SHARED_MASK) /* are readers still holding the lock? */
1328 lck_rw_lock_shared_to_exclusive_success(lock); /* if so, wait for them to drain */
1329 #if MACH_ASSERT
1330 thread_t owner = ordered_load_rw_owner(lock);
1331 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1332 #endif
1333 ordered_store_rw_owner(lock, current_thread());
1334 #if CONFIG_DTRACE
1335 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1336 #endif /* CONFIG_DTRACE */
1337 return TRUE;
1338 }
1339
1340
1341 /*
1342 * Routine: lck_rw_lock_shared_to_exclusive_failure
1343 * Function:
1344 * Fast path code has already dropped our read
1345 * count and determined that someone else owns 'lck_rw_want_upgrade'
1346 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting';
1347 * all we need to do here is determine if a wakeup is needed
1348 */
1349 static boolean_t
1350 lck_rw_lock_shared_to_exclusive_failure(
1351 lck_rw_t *lck,
1352 uint32_t prior_lock_state)
1353 {
1354 thread_t thread = current_thread();
1355 uint32_t rwlock_count;
1356
1357 /* Check if dropping the lock means that we need to unpromote */
1358 rwlock_count = thread->rwlock_count--;
1359 #if MACH_LDEBUG
1360 if (rwlock_count == 0) {
1361 panic("rw lock count underflow for thread %p", thread);
1362 }
1363 #endif
1364 if ((prior_lock_state & LCK_RW_W_WAITING) &&
1365 ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1366 /*
1367 * Someone else has requested upgrade.
1368 * Since we've released the read lock, wake
1369 * him up if he's blocked waiting
1370 */
1371 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1372 }
1373
1374 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1375 /* sched_flags checked without lock, but will be rechecked while clearing */
1376 lck_rw_clear_promotion(thread);
1377 }
1378
1379 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1380 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1381
1382 return (FALSE);
1383 }
1384
1385 /*
1386 * Routine: lck_rw_lock_shared_to_exclusive_success
1387 * Function:
1388 * assembly fast path code has already dropped our read
1389 * count and successfully acquired 'lck_rw_want_upgrade'
1390 * we just need to wait for the rest of the readers to drain
1391 * and then we can return as the exclusive holder of this lock
1392 */
1393 static boolean_t
1394 lck_rw_lock_shared_to_exclusive_success(
1395 lck_rw_t *lock)
1396 {
1397 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1398 int slept = 0;
1399 lck_rw_word_t word;
1400 wait_result_t res;
1401 boolean_t istate;
1402 boolean_t not_shared;
1403
1404 #if CONFIG_DTRACE
1405 uint64_t wait_interval = 0;
1406 int readers_at_sleep = 0;
1407 boolean_t dtrace_ls_initialized = FALSE;
1408 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1409 #endif
1410
1411 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE)) {
1412
1413 word.data = ordered_load_rw(lock);
1414 #if CONFIG_DTRACE
1415 if (dtrace_ls_initialized == FALSE) {
1416 dtrace_ls_initialized = TRUE;
1417 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1418 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1419 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1420 if (dtrace_ls_enabled) {
1421 /*
1422 * Either sleeping or spinning is happening,
1423 * start a timing of our delay interval now.
1424 */
1425 readers_at_sleep = word.shared_count;
1426 wait_interval = mach_absolute_time();
1427 }
1428 }
1429 #endif
1430
1431 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1432 trace_lck, word.shared_count, 0, 0, 0);
1433
1434 not_shared = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE);
1435
1436 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1437 trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1438
1439 if (not_shared)
1440 break;
1441
1442 /*
1443 * if we get here, the spin deadline in lck_rw_drain_status()
1444 * has expired w/o the rw_shared_count having drained to 0
1445 * check to see if we're allowed to do a thread_block
1446 */
1447 if (word.can_sleep) {
1448
1449 istate = lck_interlock_lock(lock);
1450
1451 word.data = ordered_load_rw(lock);
1452 if (word.shared_count != 0) {
1453 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1454 trace_lck, word.shared_count, 0, 0, 0);
1455
1456 word.w_waiting = 1;
1457 ordered_store_rw(lock, word.data);
1458
1459 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1460 res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT);
1461 lck_interlock_unlock(lock, istate);
1462
1463 if (res == THREAD_WAITING) {
1464 res = thread_block(THREAD_CONTINUE_NULL);
1465 slept++;
1466 }
1467 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1468 trace_lck, res, slept, 0, 0);
1469 } else {
1470 lck_interlock_unlock(lock, istate);
1471 break;
1472 }
1473 }
1474 }
1475 #if CONFIG_DTRACE
1476 /*
1477 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1478 */
1479 if (dtrace_ls_enabled == TRUE) {
1480 if (slept == 0) {
1481 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1482 } else {
1483 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1484 mach_absolute_time() - wait_interval, 1,
1485 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1486 }
1487 }
1488 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1489 #endif
1490 return (TRUE);
1491 }
1492
1493
1494 /*
1495 * Routine: lck_rw_lock_exclusive_to_shared
1496 */
1497
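/*
 * Downgrade an exclusive hold to a shared hold: add a reader reference,
 * clear whichever exclusive bit we held (WANT_UPGRADE or WANT_EXCL), and
 * let lck_rw_lock_exclusive_to_shared_gen() wake any waiting readers.
 */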
1498 void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1499 {
1500 uint32_t data, prev;
1501
1502 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1503 ordered_store_rw_owner(lock, THREAD_NULL);
1504 for ( ; ; ) {
1505 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1506 if (data & LCK_RW_INTERLOCK) {
1507 #if __SMP__
1508 atomic_exchange_abort();
1509 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1510 continue;
1511 #else
1512 panic("lck_rw_lock_exclusive_to_shared(): Interlock locked (%p): %x", lock, data);
1513 #endif // __SMP__
1514 }
1515 data += LCK_RW_SHARED_READER;
1516 if (data & LCK_RW_WANT_UPGRADE)
1517 data &= ~(LCK_RW_WANT_UPGRADE);
1518 else
1519 data &= ~(LCK_RW_WANT_EXCL);
1520 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))
1521 data &= ~(LCK_RW_W_WAITING);
1522 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp))
1523 break;
1524 cpu_pause();
1525 }
1526 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1527 }
1528
1529 /*
1530 * Routine: lck_rw_lock_exclusive_to_shared_gen
1531 * Function:
1532 * Fast path has already dropped
1533 * our exclusive state and bumped lck_rw_shared_count
1534 * all we need to do here is determine if anyone
1535 * needs to be awakened.
1536 */
1537 static void
1538 lck_rw_lock_exclusive_to_shared_gen(
1539 lck_rw_t *lck,
1540 uint32_t prior_lock_state)
1541 {
1542 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1543 lck_rw_word_t fake_lck;
1544
1545 /*
1546 * prior_lock_state is a snapshot of the 1st word of the
1547 * lock in question... we'll fake up a lock word from it
1548 * and carefully not access anything beyond what's defined
1549 * in the first word of a lck_rw_t
1550 */
1551 fake_lck.data = prior_lock_state;
1552
1553 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1554 trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1555
1556 /*
1557 * don't wake up anyone waiting to take the lock exclusively
1558 * since we hold a read count... when the read count drops to 0,
1559 * the writers will be woken.
1560 *
1561 * wake up any waiting readers if we don't have any writers waiting,
1562 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1563 */
1564 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting)
1565 thread_wakeup(LCK_RW_READER_EVENT(lck));
1566
1567 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1568 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1569
1570 #if CONFIG_DTRACE
1571 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1572 #endif
1573 }
1574
1575
1576 /*
1577 * Routine: lck_rw_try_lock
1578 */
1579 boolean_t
1580 lck_rw_try_lock(
1581 lck_rw_t *lck,
1582 lck_rw_type_t lck_rw_type)
1583 {
1584 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1585 return lck_rw_try_lock_shared(lck);
1586 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1587 return lck_rw_try_lock_exclusive(lck);
1588 else
1589 panic("lck_rw_try_lock(): Invalid rw lock type: %x", lck_rw_type);
1590 return FALSE;
1591 }
1592
1593 /*
1594 * Routine: lck_rw_try_lock_shared
1595 */
1596
1597 boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
1598 {
1599 uint32_t data, prev;
1600
1601 for ( ; ; ) {
1602 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1603 if (data & LCK_RW_INTERLOCK) {
1604 #if __SMP__
1605 atomic_exchange_abort();
1606 lck_rw_interlock_spin(lock);
1607 continue;
1608 #else
1609 panic("lck_rw_try_lock_shared(): Interlock locked (%p): %x", lock, data);
1610 #endif
1611 }
1612 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1613 atomic_exchange_abort();
1614 return FALSE; /* lock is busy */
1615 }
1616 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1617 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp))
1618 break;
1619 cpu_pause();
1620 }
1621 #if MACH_ASSERT
1622 thread_t owner = ordered_load_rw_owner(lock);
1623 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1624 #endif
1625 current_thread()->rwlock_count++;
1626 #if CONFIG_DTRACE
1627 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1628 #endif /* CONFIG_DTRACE */
1629 return TRUE;
1630 }
1631
1632
1633 /*
1634 * Routine: lck_rw_try_lock_exclusive
1635 */
1636
1637 boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
1638 {
1639 uint32_t data, prev;
1640 thread_t thread;
1641
1642 for ( ; ; ) {
1643 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1644 if (data & LCK_RW_INTERLOCK) {
1645 #if __SMP__
1646 atomic_exchange_abort();
1647 lck_rw_interlock_spin(lock);
1648 continue;
1649 #else
1650 panic("lck_rw_try_lock_exclusive(): Interlock locked (%p): %x", lock, data);
1651 #endif
1652 }
1653 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1654 atomic_exchange_abort();
1655 return FALSE;
1656 }
1657 data |= LCK_RW_WANT_EXCL;
1658 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp))
1659 break;
1660 cpu_pause();
1661 }
1662 thread = current_thread();
1663 thread->rwlock_count++;
1664 #if MACH_ASSERT
1665 thread_t owner = ordered_load_rw_owner(lock);
1666 assertf(owner == THREAD_NULL, "state=0x%x, owner=%p", ordered_load_rw(lock), owner);
1667 #endif
1668 ordered_store_rw_owner(lock, thread);
1669 #if CONFIG_DTRACE
1670 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1671 #endif /* CONFIG_DTRACE */
1672 return TRUE;
1673 }
1674
1675
1676 /*
1677 * Routine: lck_rw_unlock
1678 */
1679 void
1680 lck_rw_unlock(
1681 lck_rw_t *lck,
1682 lck_rw_type_t lck_rw_type)
1683 {
1684 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1685 lck_rw_unlock_shared(lck);
1686 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1687 lck_rw_unlock_exclusive(lck);
1688 else
1689 panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
1690 }
1691
1692
1693 /*
1694 * Routine: lck_rw_unlock_shared
1695 */
1696 void
1697 lck_rw_unlock_shared(
1698 lck_rw_t *lck)
1699 {
1700 lck_rw_type_t ret;
1701
1702 assertf(lck->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1703 assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
1704 ret = lck_rw_done(lck);
1705
1706 if (ret != LCK_RW_TYPE_SHARED)
1707 panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
1708 }
1709
1710
1711 /*
1712 * Routine: lck_rw_unlock_exclusive
1713 */
1714 void
1715 lck_rw_unlock_exclusive(
1716 lck_rw_t *lck)
1717 {
1718 lck_rw_type_t ret;
1719
1720 assertf(lck->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lck->lck_rw_data, lck->lck_rw_owner);
1721 ret = lck_rw_done(lck);
1722
1723 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1724 panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
1725 }
1726
1727
1728 /*
1729 * Routine: lck_rw_lock_exclusive_gen
1730 */
1731 static void
1732 lck_rw_lock_exclusive_gen(
1733 lck_rw_t *lock)
1734 {
1735 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1736 lck_rw_word_t word;
1737 int slept = 0;
1738 boolean_t gotlock = 0;
1739 boolean_t not_shared_or_upgrade = 0;
1740 wait_result_t res = 0;
1741 boolean_t istate;
1742
1743 #if CONFIG_DTRACE
1744 boolean_t dtrace_ls_initialized = FALSE;
1745 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1746 uint64_t wait_interval = 0;
1747 int readers_at_sleep = 0;
1748 #endif
1749
1750 /*
1751 * Try to acquire the lck_rw_want_excl bit.
1752 */
1753 while (!lck_rw_grab(lock, LCK_RW_GRAB_WANT, FALSE)) {
1754
1755 #if CONFIG_DTRACE
1756 if (dtrace_ls_initialized == FALSE) {
1757 dtrace_ls_initialized = TRUE;
1758 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1759 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1760 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1761 if (dtrace_ls_enabled) {
1762 /*
1763 * Either sleeping or spinning is happening,
1764 * start a timing of our delay interval now.
1765 */
1766 readers_at_sleep = lock->lck_rw_shared_count;
1767 wait_interval = mach_absolute_time();
1768 }
1769 }
1770 #endif
1771
1772 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1773
1774 gotlock = lck_rw_grab(lock, LCK_RW_GRAB_WANT, TRUE);
1775
1776 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1777
1778 if (gotlock)
1779 break;
1780 /*
1781 * if we get here, the deadline has expired w/o us
1782 * being able to grab the lock exclusively
1783 * check to see if we're allowed to do a thread_block
1784 */
1785 word.data = ordered_load_rw(lock);
1786 if (word.can_sleep) {
1787
1788 istate = lck_interlock_lock(lock);
1789 word.data = ordered_load_rw(lock);
1790
1791 if (word.want_excl) {
1792
1793 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1794
1795 word.w_waiting = 1;
1796 ordered_store_rw(lock, word.data);
1797
1798 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1799 res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT);
1800 lck_interlock_unlock(lock, istate);
1801
1802 if (res == THREAD_WAITING) {
1803 res = thread_block(THREAD_CONTINUE_NULL);
1804 slept++;
1805 }
1806 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1807 } else {
1808 word.want_excl = 1;
1809 ordered_store_rw(lock, word.data);
1810 lck_interlock_unlock(lock, istate);
1811 break;
1812 }
1813 }
1814 }
1815 /*
1816 * Wait for readers (and upgrades) to finish...
1817 */
1818 while (!lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE)) {
1819
1820 #if CONFIG_DTRACE
1821 /*
1822 * Either sleeping or spinning is happening, start
1823 * a timing of our delay interval now, provided the relevant
1824 * dtrace probes are enabled; otherwise we have no data and
1825 * cannot later record a dtrace spin or sleep event.
1826 */
1827 if (dtrace_ls_initialized == FALSE) {
1828 dtrace_ls_initialized = TRUE;
1829 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1830 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1831 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1832 if (dtrace_ls_enabled) {
1833 /*
1834 * Either sleeping or spinning is happening,
1835 * start a timing of our delay interval now.
1836 */
1837 readers_at_sleep = lock->lck_rw_shared_count;
1838 wait_interval = mach_absolute_time();
1839 }
1840 }
1841 #endif
1842
1843 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1844
1845 not_shared_or_upgrade = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE);
1846
1847 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, not_shared_or_upgrade, 0);
1848
1849 if (not_shared_or_upgrade)
1850 break;
1851 /*
1852 * if we get here, the deadline has expired w/o us
1853 * being able to grab the lock exclusively
1854 * check to see if we're allowed to do a thread_block
1855 */
1856 word.data = ordered_load_rw(lock);
1857 if (word.can_sleep) {
1858
1859 istate = lck_interlock_lock(lock);
1860 word.data = ordered_load_rw(lock);
1861
1862 if (word.shared_count != 0 || word.want_upgrade) {
1863 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1864
1865 word.w_waiting = 1;
1866 ordered_store_rw(lock, word.data);
1867
1868 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1869 res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT);
1870 lck_interlock_unlock(lock, istate);
1871
1872 if (res == THREAD_WAITING) {
1873 res = thread_block(THREAD_CONTINUE_NULL);
1874 slept++;
1875 }
1876 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1877 } else {
1878 lck_interlock_unlock(lock, istate);
1879 /*
1880 * must own the lock now, since we checked for
1881 * readers or upgrade owner behind the interlock
1882 * no need for a call to 'lck_rw_drain_status'
1883 */
1884 break;
1885 }
1886 }
1887 }
1888
1889 #if CONFIG_DTRACE
1890 /*
1891 * Decide which latencies we suffered that are DTrace events.
1892 * If we have set wait_interval, then we either spun or slept.
1893 * At least we get out from under the interlock before we record,
1894 * which is the best we can do here to minimize the impact
1895 * of the tracing.
1896 * If wait_interval was set to -1, then DTrace was not enabled when we
1897 * started sleeping/spinning, so we don't record this event.
1898 */
1899 if (dtrace_ls_enabled == TRUE) {
1900 if (slept == 0) {
1901 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
1902 mach_absolute_time() - wait_interval, 1);
1903 } else {
1904 /*
1905 * For the blocking case, we also record whether, when we
1906 * blocked, the lock was held for read or write, and how many readers.
1907 * Notice that we recorded this above, before we dropped
1908 * the interlock, so the count is accurate.
1909 */
1910 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
1911 mach_absolute_time() - wait_interval, 1,
1912 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1913 }
1914 }
1915 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
1916 #endif /* CONFIG_DTRACE */
1917 }
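/*
 * Illustrative sketch (editorial addition, compiled out): how a client
 * typically reaches the exclusive slow path above -- by taking and
 * releasing the lock through the lck_rw KPI. The group name, lock and
 * counter below are hypothetical.
 */
#if 0
static lck_grp_t        *my_grp;
static lck_rw_t         my_rwlock;
static uint64_t         my_counter;

static void
my_counter_setup(void)
{
        my_grp = lck_grp_alloc_init("my.counter", LCK_GRP_ATTR_NULL);
        lck_rw_init(&my_rwlock, my_grp, LCK_ATTR_NULL);
}

static void
my_counter_bump(void)
{
        lck_rw_lock_exclusive(&my_rwlock);      /* contended attempts end up in the slow path above */
        my_counter++;
        lck_rw_unlock_exclusive(&my_rwlock);
}
#endif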
1918
1919 /*
1920 * Routine: lck_rw_done
1921 */
1922
1923 lck_rw_type_t lck_rw_done(lck_rw_t *lock)
1924 {
1925 uint32_t data, prev;
1926 boolean_t once = FALSE;
1927
1928 for ( ; ; ) {
1929 data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1930 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1931 #if __SMP__
1932 atomic_exchange_abort();
1933 lck_rw_interlock_spin(lock);
1934 continue;
1935 #else
1936 panic("lck_rw_done(): Interlock locked (%p): %x", lock, data);
1937 #endif // __SMP__
1938 }
1939 if (data & LCK_RW_SHARED_MASK) { /* lock is held shared */
1940 assertf(lock->lck_rw_owner == THREAD_NULL, "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1941 data -= LCK_RW_SHARED_READER;
1942 if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */
1943 goto check_waiters;
1944 } else { /* if reader count == 0, must be exclusive lock */
1945 if (data & LCK_RW_WANT_UPGRADE) {
1946 data &= ~(LCK_RW_WANT_UPGRADE);
1947 } else {
1948 if (data & LCK_RW_WANT_EXCL)
1949 data &= ~(LCK_RW_WANT_EXCL);
1950 else /* lock is not 'owned', panic */
1951 panic("Releasing non-exclusive RW lock without a reader refcount!");
1952 }
1953 if (!once) {
1954 // Only check for holder and clear it once
1955 assertf(lock->lck_rw_owner == current_thread(), "state=0x%x, owner=%p", lock->lck_rw_data, lock->lck_rw_owner);
1956 ordered_store_rw_owner(lock, THREAD_NULL);
1957 once = TRUE;
1958 }
1959 check_waiters:
1960 /*
1961 * test the original values to match what
1962 * lck_rw_done_gen is going to do to determine
1963 * which wakeups need to happen...
1964 *
1965 * readers are woken only if !(fake_lck.priv_excl && fake_lck.w_waiting)
1966 */
1967 if (prev & LCK_RW_W_WAITING) {
1968 data &= ~(LCK_RW_W_WAITING);
1969 if ((prev & LCK_RW_PRIV_EXCL) == 0)
1970 data &= ~(LCK_RW_R_WAITING);
1971 } else
1972 data &= ~(LCK_RW_R_WAITING);
1973 }
1974 if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp))
1975 break;
1976 cpu_pause();
1977 }
1978 return lck_rw_done_gen(lock, prev);
1979 }
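/*
 * Illustrative sketch (editorial addition, compiled out): lck_rw_done()
 * releases either hold mode and reports which one it dropped, which is
 * useful when a helper must temporarily give up a caller-held lock.
 * The lock pointer is supplied by a hypothetical caller.
 */
#if 0
static void
my_drop_and_retake(lck_rw_t *rw)
{
        lck_rw_type_t held = lck_rw_done(rw);   /* LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE */

        /* ... do work that must not be done while holding the lock ... */

        lck_rw_lock(rw, held);                  /* re-acquire in the original mode */
}
#endif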
1980
1981 /*
1982 * Routine: lck_rw_done_gen
1983 *
1984 * called from lck_rw_done()...
1985 * prior_lock_state is the value in the 1st
1986 * word of the lock at the time of a successful
1987 * atomic compare and exchange with the new value...
1988 * it represents the state of the lock before we
1989 * decremented the rw_shared_count or cleared either
1990 * rw_want_upgrade or rw_want_excl and
1991 * the lck_x_waiting bits... since the caller
1992 * has already changed the state atomically,
1993 * we just need to decide whether we should
1994 * wake up anyone and what value to return... we do
1995 * this by examining the state of the lock before
1996 * we changed it
1997 */
1998 static lck_rw_type_t
1999 lck_rw_done_gen(
2000 lck_rw_t *lck,
2001 uint32_t prior_lock_state)
2002 {
2003 lck_rw_word_t fake_lck;
2004 lck_rw_type_t lock_type;
2005 thread_t thread;
2006 uint32_t rwlock_count;
2007
2008 /*
2009 * prior_lock_state is a snapshot of the 1st word of the
2010 * lock in question... we fake up a local copy of it
2011 * and carefully don't access anything beyond what's defined
2012 * in the first word of a lck_rw_t
2013 */
2014 fake_lck.data = prior_lock_state;
2015
2016 if (fake_lck.shared_count <= 1) {
2017 if (fake_lck.w_waiting)
2018 thread_wakeup(LCK_RW_WRITER_EVENT(lck));
2019
2020 if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting)
2021 thread_wakeup(LCK_RW_READER_EVENT(lck));
2022 }
2023 if (fake_lck.shared_count)
2024 lock_type = LCK_RW_TYPE_SHARED;
2025 else
2026 lock_type = LCK_RW_TYPE_EXCLUSIVE;
2027
2028 /* Check if dropping the lock means that we need to unpromote */
2029 thread = current_thread();
2030 rwlock_count = thread->rwlock_count--;
2031 #if MACH_LDEBUG
2032 if (rwlock_count == 0)
2033 panic("rw lock count underflow for thread %p", thread);
2034 #endif
2035 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2036 /* sched_flags checked without lock, but will be rechecked while clearing */
2037 lck_rw_clear_promotion(thread);
2038 }
2039 #if CONFIG_DTRACE
2040 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
2041 #endif
2042 return lock_type;
2043 }
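/*
 * Illustrative sketch (editorial addition, compiled out): the priv_excl
 * test above is what gives rw locks their default writer priority. A
 * lock created with lck_attr_rw_shared_priority() clears that behavior,
 * so waiting readers are woken even while a writer waits. The group
 * argument is hypothetical.
 */
#if 0
static lck_rw_t *
my_reader_priority_rwlock(lck_grp_t *grp)
{
        lck_attr_t      *attr = lck_attr_alloc_init();
        lck_rw_t        *rw;

        lck_attr_rw_shared_priority(attr);      /* wake readers even if a writer is queued */
        rw = lck_rw_alloc_init(grp, attr);
        lck_attr_free(attr);
        return rw;
}
#endif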
2044
2045 /*
2046 * Routine: lck_rw_lock_shared_gen
2047 * Function:
2048 * Fast path code has determined that this lock
2049 * is held exclusively... this is where we spin/block
2050 * until we can acquire the lock in the shared mode
2051 */
2052 static void
2053 lck_rw_lock_shared_gen(
2054 lck_rw_t *lck)
2055 {
2056 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
2057 lck_rw_word_t word;
2058 boolean_t gotlock = 0;
2059 int slept = 0;
2060 wait_result_t res = 0;
2061 boolean_t istate;
2062
2063 #if CONFIG_DTRACE
2064 uint64_t wait_interval = 0;
2065 int readers_at_sleep = 0;
2066 boolean_t dtrace_ls_initialized = FALSE;
2067 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
2068 #endif /* CONFIG_DTRACE */
2069
2070 while ( !lck_rw_grab(lck, LCK_RW_GRAB_SHARED, FALSE)) {
2071
2072 #if CONFIG_DTRACE
2073 if (dtrace_ls_initialized == FALSE) {
2074 dtrace_ls_initialized = TRUE;
2075 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
2076 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
2077 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
2078 if (dtrace_ls_enabled) {
2079 /*
2080 * Either sleeping or spinning is happening,
2081 * start a timing of our delay interval now.
2082 */
2083 readers_at_sleep = lck->lck_rw_shared_count;
2084 wait_interval = mach_absolute_time();
2085 }
2086 }
2087 #endif
2088
2089 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
2090 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
2091
2092 gotlock = lck_rw_grab(lck, LCK_RW_GRAB_SHARED, TRUE);
2093
2094 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
2095 trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, gotlock, 0);
2096
2097 if (gotlock)
2098 break;
2099 /*
2100 * If we get here, the deadline has expired without us
2101 * being able to grab the lock for read.
2102 * Check whether we're allowed to do a thread_block.
2103 */
2104 if (lck->lck_rw_can_sleep) {
2105
2106 istate = lck_interlock_lock(lck);
2107
2108 word.data = ordered_load_rw(lck);
2109 if ((word.want_excl || word.want_upgrade) &&
2110 ((word.shared_count == 0) || word.priv_excl)) {
2111
2112 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
2113 trace_lck, word.want_excl, word.want_upgrade, 0, 0);
2114
2115 word.r_waiting = 1;
2116 ordered_store_rw(lck, word.data);
2117
2118 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
2119 res = assert_wait(LCK_RW_READER_EVENT(lck), THREAD_UNINT);
2120 lck_interlock_unlock(lck, istate);
2121
2122 if (res == THREAD_WAITING) {
2123 res = thread_block(THREAD_CONTINUE_NULL);
2124 slept++;
2125 }
2126 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
2127 trace_lck, res, slept, 0, 0);
2128 } else {
2129 word.shared_count++;
2130 ordered_store_rw(lck, word.data);
2131 lck_interlock_unlock(lck, istate);
2132 break;
2133 }
2134 }
2135 }
2136
2137 #if CONFIG_DTRACE
2138 if (dtrace_ls_enabled == TRUE) {
2139 if (slept == 0) {
2140 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
2141 } else {
2142 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
2143 mach_absolute_time() - wait_interval, 0,
2144 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
2145 }
2146 }
2147 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
2148 #endif /* CONFIG_DTRACE */
2149 }
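/*
 * Illustrative sketch (editorial addition, compiled out): a read-mostly
 * accessor that only falls into the shared slow path above when a writer
 * holds or wants the lock. The structure is hypothetical and assumed to
 * have been initialized with lck_rw_init().
 */
#if 0
struct my_table {
        lck_rw_t        tbl_lock;
        uint32_t        tbl_value;
};

static uint32_t
my_table_read(struct my_table *t)
{
        uint32_t        v;

        lck_rw_lock_shared(&t->tbl_lock);       /* many readers may hold this concurrently */
        v = t->tbl_value;
        lck_rw_unlock_shared(&t->tbl_lock);     /* may wake a waiting writer if we were the last reader */
        return v;
}
#endif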
2150
2151
2152 void
2153 lck_rw_assert(
2154 lck_rw_t *lck,
2155 unsigned int type)
2156 {
2157 switch (type) {
2158 case LCK_RW_ASSERT_SHARED:
2159 if ((lck->lck_rw_shared_count != 0) &&
2160 (lck->lck_rw_owner == THREAD_NULL)) {
2161 return;
2162 }
2163 break;
2164 case LCK_RW_ASSERT_EXCLUSIVE:
2165 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2166 (lck->lck_rw_shared_count == 0) &&
2167 (lck->lck_rw_owner == current_thread())) {
2168 return;
2169 }
2170 break;
2171 case LCK_RW_ASSERT_HELD:
2172 if (lck->lck_rw_shared_count != 0)
2173 return; // Held shared
2174 if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2175 (lck->lck_rw_owner == current_thread())) {
2176 return; // Held exclusive
2177 }
2178 break;
2179 case LCK_RW_ASSERT_NOTHELD:
2180 if ((lck->lck_rw_shared_count == 0) &&
2181 !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2182 (lck->lck_rw_owner == THREAD_NULL)) {
2183 return;
2184 }
2185 break;
2186 default:
2187 break;
2188 }
2189 panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2190 }
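/*
 * Illustrative sketch (editorial addition, compiled out): using the
 * assertion above to document and enforce a locking contract in a helper
 * that must only run with the lock held exclusively. The structure is
 * hypothetical.
 */
#if 0
struct my_obj {
        lck_rw_t        obj_lock;
        int             obj_state;
};

static void
my_obj_update_locked(struct my_obj *o, int new_state)
{
        lck_rw_assert(&o->obj_lock, LCK_RW_ASSERT_EXCLUSIVE);  /* panics if the caller broke the contract */
        o->obj_state = new_state;
}
#endif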
2191
2192
2193 /*
2194 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2195 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2196 */
2197 boolean_t
2198 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
2199 if (not_in_kdp) {
2200 panic("rw lock exclusive check done outside of kernel debugger");
2201 }
2202 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2203 }
2204
2205 /*
2206 * The C portion of the mutex package. These routines are only invoked
2207 * if the optimized assembler routines can't do the work.
2208 */
2209
2210 /*
2211 * Forward declaration
2212 */
2213
2214 void
2215 lck_mtx_ext_init(
2216 lck_mtx_ext_t * lck,
2217 lck_grp_t * grp,
2218 lck_attr_t * attr);
2219
2220 /*
2221 * Routine: lck_mtx_alloc_init
2222 */
2223 lck_mtx_t *
2224 lck_mtx_alloc_init(
2225 lck_grp_t * grp,
2226 lck_attr_t * attr)
2227 {
2228 lck_mtx_t *lck;
2229
2230 if ((lck = (lck_mtx_t *) kalloc(sizeof(lck_mtx_t))) != 0)
2231 lck_mtx_init(lck, grp, attr);
2232
2233 return (lck);
2234 }
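/*
 * Illustrative sketch (editorial addition, compiled out): the usual
 * allocate/initialize/free pairing for the mutex routines in this file.
 * The group name is hypothetical.
 */
#if 0
static lck_grp_t        *my_mtx_grp;
static lck_mtx_t        *my_mtx;

static void
my_mtx_setup(void)
{
        my_mtx_grp = lck_grp_alloc_init("my.subsystem", LCK_GRP_ATTR_NULL);
        my_mtx = lck_mtx_alloc_init(my_mtx_grp, LCK_ATTR_NULL);
}

static void
my_mtx_teardown(void)
{
        lck_mtx_free(my_mtx, my_mtx_grp);       /* lck_mtx_destroy() + kfree() */
        lck_grp_free(my_mtx_grp);
}
#endif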
2235
2236 /*
2237 * Routine: lck_mtx_free
2238 */
2239 void
2240 lck_mtx_free(
2241 lck_mtx_t * lck,
2242 lck_grp_t * grp)
2243 {
2244 lck_mtx_destroy(lck, grp);
2245 kfree((void *) lck, sizeof(lck_mtx_t));
2246 }
2247
2248 /*
2249 * Routine: lck_mtx_init
2250 */
2251 void
2252 lck_mtx_init(
2253 lck_mtx_t * lck,
2254 lck_grp_t * grp,
2255 lck_attr_t * attr)
2256 {
2257 #ifdef BER_XXX
2258 lck_mtx_ext_t *lck_ext;
2259 #endif
2260 lck_attr_t *lck_attr;
2261
2262 if (attr != LCK_ATTR_NULL)
2263 lck_attr = attr;
2264 else
2265 lck_attr = &LockDefaultLckAttr;
2266
2267 #ifdef BER_XXX
2268 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2269 if ((lck_ext = (lck_mtx_ext_t *) kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2270 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2271 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2272 lck->lck_mtx_ptr = lck_ext;
2273 lck->lck_mtx_type = LCK_MTX_TYPE;
2274 }
2275 } else
2276 #endif
2277 {
2278 lck->lck_mtx_ptr = NULL; // Clear any padding in the union fields below
2279 lck->lck_mtx_waiters = 0;
2280 lck->lck_mtx_pri = 0;
2281 lck->lck_mtx_type = LCK_MTX_TYPE;
2282 ordered_store_mtx(lck, 0);
2283 }
2284 lck_grp_reference(grp);
2285 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2286 }
2287
2288 /*
2289 * Routine: lck_mtx_init_ext
2290 */
2291 void
2292 lck_mtx_init_ext(
2293 lck_mtx_t * lck,
2294 lck_mtx_ext_t * lck_ext,
2295 lck_grp_t * grp,
2296 lck_attr_t * attr)
2297 {
2298 lck_attr_t *lck_attr;
2299
2300 if (attr != LCK_ATTR_NULL)
2301 lck_attr = attr;
2302 else
2303 lck_attr = &LockDefaultLckAttr;
2304
2305 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2306 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2307 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2308 lck->lck_mtx_ptr = lck_ext;
2309 lck->lck_mtx_type = LCK_MTX_TYPE;
2310 } else {
2311 lck->lck_mtx_waiters = 0;
2312 lck->lck_mtx_pri = 0;
2313 lck->lck_mtx_type = LCK_MTX_TYPE;
2314 ordered_store_mtx(lck, 0);
2315 }
2316 lck_grp_reference(grp);
2317 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2318 }
2319
2320 /*
2321 * Routine: lck_mtx_ext_init
2322 */
2323 void
2324 lck_mtx_ext_init(
2325 lck_mtx_ext_t * lck,
2326 lck_grp_t * grp,
2327 lck_attr_t * attr)
2328 {
2329 bzero((void *) lck, sizeof(lck_mtx_ext_t));
2330
2331 lck->lck_mtx.lck_mtx_type = LCK_MTX_TYPE;
2332
2333 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2334 lck->lck_mtx_deb.type = MUTEX_TAG;
2335 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2336 }
2337 lck->lck_mtx_grp = grp;
2338
2339 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
2340 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2341 }
2342
2343 /* The slow versions */
2344 static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2345 static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
2346 static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
2347
2348 /*
2349 * Routine: lck_mtx_verify
2350 *
2351 * Verify that a mutex is valid
2352 */
2353 static inline void
2354 lck_mtx_verify(lck_mtx_t *lock)
2355 {
2356 if (lock->lck_mtx_type != LCK_MTX_TYPE)
2357 panic("Invalid mutex %p", lock);
2358 #if DEVELOPMENT || DEBUG
2359 if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
2360 panic("Mutex destroyed %p", lock);
2361 #endif /* DEVELOPMENT || DEBUG */
2362 }
2363
2364 /*
2365 * Routine: lck_mtx_check_preemption
2366 *
2367 * Verify preemption is enabled when attempting to acquire a mutex.
2368 */
2369
2370 static inline void
2371 lck_mtx_check_preemption(lck_mtx_t *lock)
2372 {
2373 #if DEVELOPMENT || DEBUG
2374 int pl = get_preemption_level();
2375
2376 if (pl != 0)
2377 panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
2378 #else
2379 (void)lock;
2380 #endif
2381 }
2382
2383 /*
2384 * Routine: lck_mtx_lock
2385 */
2386 void
2387 lck_mtx_lock(lck_mtx_t *lock)
2388 {
2389 thread_t thread;
2390
2391 lck_mtx_verify(lock);
2392 lck_mtx_check_preemption(lock);
2393 thread = current_thread();
2394 if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread),
2395 memory_order_acquire_smp, FALSE)) {
2396 #if CONFIG_DTRACE
2397 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2398 #endif /* CONFIG_DTRACE */
2399 return;
2400 }
2401 lck_mtx_lock_contended(lock, thread, FALSE);
2402 }
2403
2404 /*
2405 * This is the slow version of mutex locking.
2406 */
2407 static void NOINLINE
2408 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
2409 {
2410 thread_t holding_thread;
2411 uintptr_t state;
2412 int waiters;
2413
2414 if (interlocked)
2415 goto interlock_held;
2416
2417 for ( ; ; ) {
2418 if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread),
2419 memory_order_acquire_smp, FALSE))
2420 return;
2421 interlock_lock(lock);
2422 interlock_held:
2423 state = ordered_load_mtx(lock);
2424 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2425 if (holding_thread == NULL)
2426 break;
2427 ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
2428 lck_mtx_lock_wait(lock, holding_thread);
2429 }
2430 waiters = lck_mtx_lock_acquire(lock);
2431 state = LCK_MTX_THREAD_TO_STATE(thread);
2432 if (waiters != 0)
2433 state |= ARM_LCK_WAITERS;
2434 #if __SMP__
2435 state |= LCK_ILOCK; // Preserve interlock
2436 ordered_store_mtx(lock, state); // Set ownership
2437 interlock_unlock(lock); // Release interlock, enable preemption
2438 #else
2439 ordered_store_mtx(lock, state); // Set ownership
2440 enable_preemption();
2441 #endif
2442 load_memory_barrier();
2443
2444 #if CONFIG_DTRACE
2445 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
2446 #endif /* CONFIG_DTRACE */
2447 }
2448
2449 /*
2450 * Common code for mutex locking as spinlock
2451 */
2452 static inline void
2453 lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2454 {
2455 uintptr_t state;
2456
2457 interlock_lock(lock);
2458 state = ordered_load_mtx(lock);
2459 if (LCK_MTX_STATE_TO_THREAD(state)) {
2460 if (allow_held_as_mutex)
2461 lck_mtx_lock_contended(lock, current_thread(), TRUE);
2462 else
2463 // "Always" variants can never block. If the lock is held and blocking is not allowed
2464 // then someone is mixing always and non-always calls on the same lock, which is
2465 // forbidden.
2466 panic("Attempting to block on a lock taken as spin-always %p", lock);
2467 return;
2468 }
2469 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2470 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2471 ordered_store_mtx(lock, state);
2472 load_memory_barrier();
2473
2474 #if CONFIG_DTRACE
2475 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
2476 #endif /* CONFIG_DTRACE */
2477 }
2478
2479 /*
2480 * Routine: lck_mtx_lock_spin
2481 */
2482 void
2483 lck_mtx_lock_spin(lck_mtx_t *lock)
2484 {
2485 lck_mtx_check_preemption(lock);
2486 lck_mtx_lock_spin_internal(lock, TRUE);
2487 }
2488
2489 /*
2490 * Routine: lck_mtx_lock_spin_always
2491 */
2492 void
2493 lck_mtx_lock_spin_always(lck_mtx_t *lock)
2494 {
2495 lck_mtx_lock_spin_internal(lock, FALSE);
2496 }
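/*
 * Illustrative sketch (editorial addition, compiled out): the spin
 * variants above suit very short critical sections. lck_mtx_unlock()
 * understands the spin-held form, so the pairing below is valid. The
 * statistics lock and counter are hypothetical and assumed initialized.
 */
#if 0
static lck_mtx_t        my_stat_lock;
static uint64_t         my_stat_count;

static void
my_stat_increment(void)
{
        lck_mtx_lock_spin(&my_stat_lock);       /* spins for the interlock; blocks only if held as a full mutex */
        my_stat_count++;
        lck_mtx_unlock(&my_stat_lock);
}
#endif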
2497
2498 /*
2499 * Routine: lck_mtx_try_lock
2500 */
2501 boolean_t
2502 lck_mtx_try_lock(lck_mtx_t *lock)
2503 {
2504 thread_t thread = current_thread();
2505
2506 lck_mtx_verify(lock);
2507 if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread),
2508 memory_order_acquire_smp, FALSE)) {
2509 #if CONFIG_DTRACE
2510 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
2511 #endif /* CONFIG_DTRACE */
2512 return TRUE;
2513 }
2514 return lck_mtx_try_lock_contended(lock, thread);
2515 }
2516
2517 static boolean_t NOINLINE
2518 lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
2519 {
2520 thread_t holding_thread;
2521 uintptr_t state;
2522 int waiters;
2523
2524 #if __SMP__
2525 interlock_lock(lock);
2526 state = ordered_load_mtx(lock);
2527 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2528 if (holding_thread) {
2529 interlock_unlock(lock);
2530 return FALSE;
2531 }
2532 #else
2533 disable_preemption_for_thread(thread);
2534 state = ordered_load_mtx(lock);
2535 if (state & LCK_ILOCK)
2536 panic("Unexpected interlock set (%p)", lock);
2537 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
2538 if (holding_thread) {
2539 enable_preemption();
2540 return FALSE;
2541 }
2542 state |= LCK_ILOCK;
2543 ordered_store_mtx(lock, state);
2544 #endif // __SMP__
2545 waiters = lck_mtx_lock_acquire(lock);
2546 state = LCK_MTX_THREAD_TO_STATE(thread);
2547 if (waiters != 0)
2548 state |= ARM_LCK_WAITERS;
2549 #if __SMP__
2550 state |= LCK_ILOCK; // Preserve interlock
2551 ordered_store_mtx(lock, state); // Set ownership
2552 interlock_unlock(lock); // Release interlock, enable preemption
2553 #else
2554 ordered_store_mtx(lock, state); // Set ownership
2555 enable_preemption();
2556 #endif
2557 load_memory_barrier();
2558 return TRUE;
2559 }
2560
2561 static inline boolean_t
2562 lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
2563 {
2564 uintptr_t state;
2565
2566 if (!interlock_try(lock))
2567 return FALSE;
2568 state = ordered_load_mtx(lock);
2569 if (LCK_MTX_STATE_TO_THREAD(state)) {
2570 // Lock is held as mutex
2571 if (allow_held_as_mutex)
2572 interlock_unlock(lock);
2573 else
2574 // "Always" variants can never block. If the lock is held as a normal mutex
2575 // then someone is mixing always and non-always calls on the same lock, which is
2576 // forbidden.
2577 panic("Spin-mutex held as full mutex %p", lock);
2578 return FALSE;
2579 }
2580 state &= ARM_LCK_WAITERS; // Preserve waiters bit
2581 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
2582 ordered_store_mtx(lock, state);
2583 load_memory_barrier();
2584
2585 #if CONFIG_DTRACE
2586 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
2587 #endif /* CONFIG_DTRACE */
2588 return TRUE;
2589 }
2590
2591 /*
2592 * Routine: lck_mtx_try_lock_spin
2593 */
2594 boolean_t
2595 lck_mtx_try_lock_spin(lck_mtx_t *lock)
2596 {
2597 return lck_mtx_try_lock_spin_internal(lock, TRUE);
2598 }
2599
2600 /*
2601 * Routine: lck_mtx_try_lock_spin_always
2602 */
2603 boolean_t
2604 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
2605 {
2606 return lck_mtx_try_lock_spin_internal(lock, FALSE);
2607 }
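/*
 * Illustrative sketch (editorial addition, compiled out): a try-lock
 * pattern that performs optional work only when the mutex is immediately
 * available instead of blocking. The cache structure is hypothetical.
 */
#if 0
struct my_cache {
        lck_mtx_t       c_lock;
        int             c_dirty;
};

static void
my_cache_try_flush(struct my_cache *c)
{
        if (!lck_mtx_try_lock(&c->c_lock))
                return;                 /* someone else holds it; skip this round */
        if (c->c_dirty)
                c->c_dirty = 0;         /* stand-in for the real flush work */
        lck_mtx_unlock(&c->c_lock);
}
#endif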
2608
2609
2610
2611 /*
2612 * Routine: lck_mtx_unlock
2613 */
2614 void
2615 lck_mtx_unlock(lck_mtx_t *lock)
2616 {
2617 thread_t thread = current_thread();
2618 uintptr_t state;
2619 boolean_t ilk_held = FALSE;
2620
2621 lck_mtx_verify(lock);
2622
2623 state = ordered_load_mtx(lock);
2624 if (state & LCK_ILOCK) {
2625 if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG)
2626 ilk_held = TRUE; // Interlock is held by (presumably) this thread
2627 goto slow_case;
2628 }
2629 // Locked as a mutex
2630 if (atomic_compare_exchange(&lock->lck_mtx_data, LCK_MTX_THREAD_TO_STATE(thread), 0,
2631 memory_order_release_smp, FALSE)) {
2632 #if CONFIG_DTRACE
2633 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2634 #endif /* CONFIG_DTRACE */
2635 return;
2636 }
2637 slow_case:
2638 lck_mtx_unlock_contended(lock, thread, ilk_held);
2639 }
2640
2641 static void NOINLINE
2642 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
2643 {
2644 uintptr_t state;
2645
2646 if (ilk_held) {
2647 state = ordered_load_mtx(lock);
2648 } else {
2649 #if __SMP__
2650 interlock_lock(lock);
2651 state = ordered_load_mtx(lock);
2652 if (thread != LCK_MTX_STATE_TO_THREAD(state))
2653 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2654 #else
2655 disable_preemption_for_thread(thread);
2656 state = ordered_load_mtx(lock);
2657 if (state & LCK_ILOCK)
2658 panic("lck_mtx_unlock(): Unexpected interlock set (%p)", lock);
2659 if (thread != LCK_MTX_STATE_TO_THREAD(state))
2660 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
2661 state |= LCK_ILOCK;
2662 ordered_store_mtx(lock, state);
2663 #endif
2664 }
2665 if (state & ARM_LCK_WAITERS) {
2666 lck_mtx_unlock_wakeup(lock, thread);
2667 state = ordered_load_mtx(lock);
2668 } else {
2669 assertf(lock->lck_mtx_pri == 0, "pri=0x%x", lock->lck_mtx_pri);
2670 }
2671 state &= ARM_LCK_WAITERS; // Retain waiters bit
2672 #if __SMP__
2673 state |= LCK_ILOCK;
2674 ordered_store_mtx(lock, state);
2675 interlock_unlock(lock);
2676 #else
2677 ordered_store_mtx(lock, state);
2678 enable_preemption();
2679 #endif
2680
2681 #if CONFIG_DTRACE
2682 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
2683 #endif /* CONFIG_DTRACE */
2684 }
2685
2686 /*
2687 * Routine: lck_mtx_assert
2688 */
2689 void
2690 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
2691 {
2692 thread_t thread, holder;
2693 uintptr_t state;
2694
2695 state = ordered_load_mtx(lock);
2696 holder = LCK_MTX_STATE_TO_THREAD(state);
2697 if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
2698 // Lock is held in spin mode, owner is unknown.
2699 return; // Punt
2700 }
2701 thread = current_thread();
2702 if (type == LCK_MTX_ASSERT_OWNED) {
2703 if (thread != holder)
2704 panic("lck_mtx_assert(): mutex (%p) not owned", lock);
2705 } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
2706 if (thread == holder)
2707 panic("lck_mtx_assert(): mutex (%p) owned", lock);
2708 } else
2709 panic("lck_mtx_assert(): invalid arg (%u)", type);
2710 }
2711
2712 /*
2713 * Routine: lck_mtx_ilk_unlock
2714 */
2715 boolean_t
2716 lck_mtx_ilk_unlock(lck_mtx_t *lock)
2717 {
2718 interlock_unlock(lock);
2719 return TRUE;
2720 }
2721
2722 /*
2723 * Routine: lck_mtx_convert_spin
2724 *
2725 * Convert a mutex held for spin into a held full mutex
2726 */
2727 void
2728 lck_mtx_convert_spin(lck_mtx_t *lock)
2729 {
2730 thread_t thread = current_thread();
2731 uintptr_t state;
2732 int waiters;
2733
2734 state = ordered_load_mtx(lock);
2735 if (LCK_MTX_STATE_TO_THREAD(state) == thread)
2736 return; // Already owned as mutex, return
2737 if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG))
2738 panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
2739 state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag
2740 ordered_store_mtx(lock, state);
2741 waiters = lck_mtx_lock_acquire(lock); // Acquire to manage priority boosts
2742 state = LCK_MTX_THREAD_TO_STATE(thread);
2743 if (waiters != 0)
2744 state |= ARM_LCK_WAITERS;
2745 #if __SMP__
2746 state |= LCK_ILOCK;
2747 ordered_store_mtx(lock, state); // Set ownership
2748 interlock_unlock(lock); // Release interlock, enable preemption
2749 #else
2750 ordered_store_mtx(lock, state); // Set ownership
2751 enable_preemption();
2752 #endif
2753 }
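/*
 * Illustrative sketch (editorial addition, compiled out): take the lock
 * in spin mode for a cheap check, and convert it to a full mutex only
 * when the slower, possibly blocking work turns out to be necessary.
 * The queue structure is hypothetical.
 */
#if 0
struct my_queue {
        lck_mtx_t       q_lock;
        int             q_len;
};

static void
my_queue_drain_if_needed(struct my_queue *q)
{
        lck_mtx_lock_spin(&q->q_lock);          /* interlock + spin tag only */
        if (q->q_len == 0) {
                lck_mtx_unlock(&q->q_lock);     /* nothing to do, never converted */
                return;
        }
        lck_mtx_convert_spin(&q->q_lock);       /* now held as a full mutex; blocking is allowed */
        /* ... drain the queue, possibly sleeping ... */
        q->q_len = 0;
        lck_mtx_unlock(&q->q_lock);
}
#endif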
2754
2755
2756 /*
2757 * Routine: lck_mtx_destroy
2758 */
2759 void
2760 lck_mtx_destroy(
2761 lck_mtx_t * lck,
2762 lck_grp_t * grp)
2763 {
2764 if (lck->lck_mtx_type != LCK_MTX_TYPE)
2765 panic("Destroying invalid mutex %p", lck);
2766 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
2767 panic("Destroying previously destroyed lock %p", lck);
2768 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2769 lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
2770 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2771 lck_grp_deallocate(grp);
2772 return;
2773 }
2774
2775 /*
2776 * Routine: lck_spin_assert
2777 */
2778 void
2779 lck_spin_assert(lck_spin_t *lock, unsigned int type)
2780 {
2781 thread_t thread, holder;
2782 uintptr_t state;
2783
2784 if (lock->type != LCK_SPIN_TYPE)
2785 panic("Invalid spinlock %p", lock);
2786
2787 state = lock->lck_spin_data;
2788 holder = (thread_t)(state & ~LCK_ILOCK);
2789 thread = current_thread();
2790 if (type == LCK_ASSERT_OWNED) {
2791 if (holder == 0)
2792 panic("Lock not owned %p = %lx", lock, state);
2793 if (holder != thread)
2794 panic("Lock not owned by current thread %p = %lx", lock, state);
2795 if ((state & LCK_ILOCK) == 0)
2796 panic("Lock bit not set %p = %lx", lock, state);
2797 } else if (type == LCK_ASSERT_NOTOWNED) {
2798 if (holder != 0) {
2799 if (holder == thread)
2800 panic("Lock owned by current thread %p = %lx", lock, state);
2801 else
2802 panic("Lock %p owned by thread %p", lock, holder);
2803 }
2804 if (state & LCK_ILOCK)
2805 panic("Lock bit set %p = %lx", lock, state);
2806 } else
2807 panic("lck_spin_assert(): invalid arg (%u)", type);
2808 }
2809
2810 boolean_t
2811 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2812 {
2813 lck_rw_word_t word;
2814
2815 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2816
2817 word.data = ordered_load_rw(lck);
2818 if (word.want_excl || word.want_upgrade || force_yield) {
2819 lck_rw_unlock_shared(lck);
2820 mutex_pause(2);
2821 lck_rw_lock_shared(lck);
2822 return TRUE;
2823 }
2824
2825 return FALSE;
2826 }
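/*
 * Illustrative sketch (editorial addition, compiled out): a long scan
 * under the shared lock that periodically offers the lock to queued
 * writers via the routine above. The list structure is hypothetical.
 */
#if 0
struct my_list {
        lck_rw_t        l_lock;
        int             l_count;
};

static void
my_list_scan(struct my_list *l)
{
        int             i;

        lck_rw_lock_shared(&l->l_lock);
        for (i = 0; i < l->l_count; i++) {
                /* ... examine element i ... */
                if (lck_rw_lock_yield_shared(&l->l_lock, FALSE)) {
                        /* the lock was dropped and re-taken; restart conservatively */
                        i = -1;         /* loop increment brings us back to element 0 */
                }
        }
        lck_rw_unlock_shared(&l->l_lock);
}
#endif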
2827
2828 /*
2829 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2830 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2831 */
2832 boolean_t
2833 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2834 {
2835 uintptr_t state;
2836
2837 if (not_in_kdp) {
2838 panic("spinlock acquired check done outside of kernel debugger");
2839 }
2840 state = ordered_load_mtx(lck);
2841 if (state == LCK_MTX_TAG_DESTROYED)
2842 return FALSE;
2843 if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK))
2844 return TRUE;
2845 return FALSE;
2846 }
2847
2848 void
2849 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2850 {
2851 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
2852 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2853 uintptr_t state = ordered_load_mtx(mutex);
2854 thread_t holder = LCK_MTX_STATE_TO_THREAD(state);
2855 if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
2856 waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
2857 } else {
2858 assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
2859 assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
2860 waitinfo->owner = thread_tid(holder);
2861 }
2862 }
2863
2864 void
2865 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2866 {
2867 lck_rw_t *rwlck = NULL;
2868 switch (waitinfo->wait_type) {
2869 case kThreadWaitKernelRWLockRead:
2870 rwlck = READ_EVENT_TO_RWLOCK(event);
2871 break;
2872 case kThreadWaitKernelRWLockWrite:
2873 case kThreadWaitKernelRWLockUpgrade:
2874 rwlck = WRITE_EVENT_TO_RWLOCK(event);
2875 break;
2876 default:
2877 panic("%s was called with an invalid blocking type", __FUNCTION__);
2878 break;
2879 }
2880 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2881 waitinfo->owner = thread_tid(rwlck->lck_rw_owner);
2882 }