1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #define LOCK_PRIVATE 1
65
66 #include <mach_ldebug.h>
67
68 #include <kern/lock_stat.h>
69 #include <kern/locks.h>
70 #include <kern/kalloc.h>
71 #include <kern/misc_protos.h>
72 #include <kern/thread.h>
73 #include <kern/processor.h>
74 #include <kern/cpu_data.h>
75 #include <kern/cpu_number.h>
76 #include <kern/sched_prim.h>
77 #include <kern/debug.h>
78 #include <string.h>
79
80 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
81 #include <machine/atomic.h>
82 #include <machine/machine_cpu.h>
83 #include <i386/mp.h>
84 #include <machine/atomic.h>
85 #include <sys/kdebug.h>
86 #include <i386/locks_i386_inlines.h>
87 #include <kern/cpu_number.h>
88 #include <os/hash.h>
89
90 #if CONFIG_DTRACE
91 #define DTRACE_RW_SHARED 0x0 //reader
92 #define DTRACE_RW_EXCL 0x1 //writer
93 #define DTRACE_NO_FLAG 0x0 //not applicable
94 #endif /* CONFIG_DTRACE */
95
96 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
97 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
98 #define LCK_RW_LCK_SHARED_CODE 0x102
99 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
100 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
101 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
102
103 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
104 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
105 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
106 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
107 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
108 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
109 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
110 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
111
112
113 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
114
115 unsigned int LcksOpts = 0;
116
117 #if DEVELOPMENT || DEBUG
118 unsigned int LckDisablePreemptCheck = 0;
119 #endif
120
121 /* Forwards */
122
123 #if USLOCK_DEBUG
124 /*
125 * Perform simple lock checks.
126 */
127 int uslock_check = 1;
128 int max_lock_loops = 100000000;
129 decl_simple_lock_data(extern, printf_lock);
130 decl_simple_lock_data(extern, panic_lock);
131 #endif /* USLOCK_DEBUG */
132
133 extern unsigned int not_in_kdp;
134
135 /*
136 * We often want to know the addresses of the callers
137 * of the various lock routines. However, this information
138 * is only used for debugging and statistics.
139 */
140 typedef void *pc_t;
141 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
142 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
143 #if ANY_LOCK_DEBUG
144 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
145 #define DECL_PC(pc) pc_t pc;
146 #else /* ANY_LOCK_DEBUG */
147 #define DECL_PC(pc)
148 #ifdef lint
149 /*
150 * Eliminate lint complaints about unused local pc variables.
151 */
152 #define OBTAIN_PC(pc) ++pc
153 #else /* lint */
154 #define OBTAIN_PC(pc)
155 #endif /* lint */
156 #endif /* ANY_LOCK_DEBUG */
157
158 /*
159 * The atomic exchange API is a low-level abstraction of the operations
160 * needed to atomically read, modify, and write a memory word. This abstraction works
161 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
162 * well as the ARM exclusive instructions.
163 *
164 * atomic_exchange_begin() - begin exchange and retrieve current value
165 * atomic_exchange_complete() - conclude an exchange
166 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
167 */
168 static uint32_t
169 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
170 {
171 uint32_t val;
172
173 (void)ord; // Memory order not used
174 val = os_atomic_load(target, relaxed);
175 *previous = val;
176 return val;
177 }
178
179 static boolean_t
180 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
181 {
182 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
183 }
184
185 static void
186 atomic_exchange_abort(void)
187 {
188 }
189
190 static boolean_t
191 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
192 {
193 uint32_t value, prev;
194
195 for (;;) {
196 value = atomic_exchange_begin32(target, &prev, ord);
197 if (value & test_mask) {
198 if (wait) {
199 cpu_pause();
200 } else {
201 atomic_exchange_abort();
202 }
203 return FALSE;
204 }
205 value |= set_mask;
206 if (atomic_exchange_complete32(target, prev, value, ord)) {
207 return TRUE;
208 }
209 }
210 }
211
212 inline boolean_t
213 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
214 {
215 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
216 }
217
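/*
 * A minimal sketch of the begin/complete/abort pattern documented above
 * (hypothetical word layout: the high bit is a "busy" flag, the low bits a
 * counter). Kept under #if 0 as an illustration only; it is not part of the
 * lock implementation.
 */
#if 0
static boolean_t
example_increment_unless_busy(uint32_t *word)
{
	uint32_t data, prev;

	for (;;) {
		data = atomic_exchange_begin32(word, &prev, memory_order_relaxed);
		if (data & 0x80000000u) {               /* hypothetical busy bit */
			atomic_exchange_abort();        /* leave the word untouched */
			return FALSE;
		}
		data += 1;
		if (atomic_exchange_complete32(word, prev, data, memory_order_relaxed)) {
			return TRUE;                    /* our update was applied atomically */
		}
		cpu_pause();                            /* lost the race; retry */
	}
}
#endif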
218 /*
219 * Portable lock package implementation of usimple_locks.
220 */
221
222 #if USLOCK_DEBUG
223 #define USLDBG(stmt) stmt
224 void usld_lock_init(usimple_lock_t, unsigned short);
225 void usld_lock_pre(usimple_lock_t, pc_t);
226 void usld_lock_post(usimple_lock_t, pc_t);
227 void usld_unlock(usimple_lock_t, pc_t);
228 void usld_lock_try_pre(usimple_lock_t, pc_t);
229 void usld_lock_try_post(usimple_lock_t, pc_t);
230 int usld_lock_common_checks(usimple_lock_t, char *);
231 #else /* USLOCK_DEBUG */
232 #define USLDBG(stmt)
233 #endif /* USLOCK_DEBUG */
234
235 /*
236 * Forward definitions
237 */
238
239 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
240 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
241 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
242 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
243 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
244 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
245 void lck_rw_clear_promotions_x86(thread_t thread);
246 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
247 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
248 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
249 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
250 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
251 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
252 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
253 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
254 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
255
256
257 /*
258 * Routine: lck_spin_alloc_init
259 */
260 lck_spin_t *
261 lck_spin_alloc_init(
262 lck_grp_t *grp,
263 lck_attr_t *attr)
264 {
265 lck_spin_t *lck;
266
267 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) {
268 lck_spin_init(lck, grp, attr);
269 }
270
271 return lck;
272 }
273
274 /*
275 * Routine: lck_spin_free
276 */
277 void
278 lck_spin_free(
279 lck_spin_t *lck,
280 lck_grp_t *grp)
281 {
282 lck_spin_destroy(lck, grp);
283 kfree(lck, sizeof(lck_spin_t));
284 }
285
286 /*
287 * Routine: lck_spin_init
288 */
289 void
290 lck_spin_init(
291 lck_spin_t *lck,
292 lck_grp_t *grp,
293 __unused lck_attr_t *attr)
294 {
295 usimple_lock_init((usimple_lock_t) lck, 0);
296 if (grp) {
297 lck_grp_reference(grp);
298 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
299 }
300 }
301
302 /*
303 * Routine: lck_spin_destroy
304 */
305 void
306 lck_spin_destroy(
307 lck_spin_t *lck,
308 lck_grp_t *grp)
309 {
310 if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
311 return;
312 }
313 lck->interlock = LCK_SPIN_TAG_DESTROYED;
314 if (grp) {
315 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
316 lck_grp_deallocate(grp);
317 }
318 return;
319 }
320
321 /*
322 * Routine: lck_spin_lock
323 */
324 void
325 lck_spin_lock_grp(
326 lck_spin_t *lck,
327 lck_grp_t *grp)
328 {
329 #pragma unused(grp)
330 usimple_lock((usimple_lock_t) lck, grp);
331 }
332
333 void
334 lck_spin_lock(
335 lck_spin_t *lck)
336 {
337 usimple_lock((usimple_lock_t) lck, NULL);
338 }
339
340 /*
341 * Routine: lck_spin_unlock
342 */
343 void
344 lck_spin_unlock(
345 lck_spin_t *lck)
346 {
347 usimple_unlock((usimple_lock_t) lck);
348 }
349
350 boolean_t
351 lck_spin_try_lock_grp(
352 lck_spin_t *lck,
353 lck_grp_t *grp)
354 {
355 #pragma unused(grp)
356 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
357 #if DEVELOPMENT || DEBUG
358 if (lrval) {
359 pltrace(FALSE);
360 }
361 #endif
362 return lrval;
363 }
364
365
366 /*
367 * Routine: lck_spin_try_lock
368 */
369 boolean_t
370 lck_spin_try_lock(
371 lck_spin_t *lck)
372 {
373 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
374 #if DEVELOPMENT || DEBUG
375 if (lrval) {
376 pltrace(FALSE);
377 }
378 #endif
379 return lrval;
380 }
381
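/*
 * A minimal usage sketch of the spin-lock interfaces above (hypothetical
 * lock group and lock; real callers keep these in their own subsystems).
 * Illustration only, kept under #if 0.
 */
#if 0
static void
example_spin_usage(void)
{
	lck_grp_t  *grp  = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	lck_spin_t *lock = lck_spin_alloc_init(grp, LCK_ATTR_NULL);

	lck_spin_lock(lock);            /* returns with preemption disabled */
	/* ... short critical section; no blocking allowed ... */
	lck_spin_unlock(lock);

	if (lck_spin_try_lock(lock)) {  /* non-blocking attempt */
		lck_spin_unlock(lock);
	}

	lck_spin_free(lock, grp);
	lck_grp_free(grp);
}
#endif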
382 /*
383 * Routine: lck_spin_assert
384 */
385 void
386 lck_spin_assert(lck_spin_t *lock, unsigned int type)
387 {
388 thread_t thread, holder;
389 uintptr_t state;
390
391 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
392 panic("lck_spin_assert(): invalid arg (%u)", type);
393 }
394
395 state = lock->interlock;
396 holder = (thread_t)state;
397 thread = current_thread();
398 if (type == LCK_ASSERT_OWNED) {
399 if (__improbable(holder == THREAD_NULL)) {
400 panic("Lock not owned %p = %lx", lock, state);
401 }
402 if (__improbable(holder != thread)) {
403 panic("Lock not owned by current thread %p = %lx", lock, state);
404 }
405 } else if (type == LCK_ASSERT_NOTOWNED) {
406 if (__improbable(holder != THREAD_NULL)) {
407 if (holder == thread) {
408 panic("Lock owned by current thread %p = %lx", lock, state);
409 }
410 }
411 }
412 }
413
414 /*
415 * Routine: kdp_lck_spin_is_acquired
416 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
417 * Returns: TRUE if lock is acquired.
418 */
419 boolean_t
420 kdp_lck_spin_is_acquired(lck_spin_t *lck)
421 {
422 if (not_in_kdp) {
423 panic("panic: spinlock acquired check done outside of kernel debugger");
424 }
425 return (lck->interlock != 0)? TRUE : FALSE;
426 }
427
428 /*
429 * Initialize a usimple_lock.
430 *
431 * No change in preemption state.
432 */
433 void
434 usimple_lock_init(
435 usimple_lock_t l,
436 __unused unsigned short tag)
437 {
438 #ifndef MACHINE_SIMPLE_LOCK
439 USLDBG(usld_lock_init(l, tag));
440 hw_lock_init(&l->interlock);
441 #else
442 simple_lock_init((simple_lock_t)l, tag);
443 #endif
444 }
445
446 volatile uint32_t spinlock_owner_cpu = ~0;
447 volatile usimple_lock_t spinlock_timed_out;
448
449 uint32_t
450 spinlock_timeout_NMI(uintptr_t thread_addr)
451 {
452 uint32_t i;
453
454 for (i = 0; i < real_ncpus; i++) {
455 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
456 spinlock_owner_cpu = i;
457 if ((uint32_t) cpu_number() != i) {
458 /* Cause NMI and panic on the owner's cpu */
459 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
460 }
461 break;
462 }
463 }
464
465 return spinlock_owner_cpu;
466 }
467
468 /*
469 * Acquire a usimple_lock.
470 *
471 * Returns with preemption disabled. Note
472 * that the hw_lock routines are responsible for
473 * maintaining preemption state.
474 */
475 void
476 (usimple_lock)(
477 usimple_lock_t l
478 LCK_GRP_ARG(lck_grp_t *grp))
479 {
480 #ifndef MACHINE_SIMPLE_LOCK
481 DECL_PC(pc);
482
483 OBTAIN_PC(pc);
484 USLDBG(usld_lock_pre(l, pc));
485
486 if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
487 boolean_t uslock_acquired = FALSE;
488 while (machine_timeout_suspended()) {
489 enable_preemption();
490 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
491 break;
492 }
493 }
494
495 if (uslock_acquired == FALSE) {
496 uint32_t lock_cpu;
497 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
498 spinlock_timed_out = l;
499 lock_cpu = spinlock_timeout_NMI(lowner);
500 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
501 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
502 }
503 }
504 #if DEVELOPMENT || DEBUG
505 pltrace(FALSE);
506 #endif
507
508 USLDBG(usld_lock_post(l, pc));
509 #else
510 simple_lock((simple_lock_t)l, grp);
511 #endif
512 #if CONFIG_DTRACE
513 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
514 #endif
515 }
516
517
518 /*
519 * Release a usimple_lock.
520 *
521 * Returns with preemption enabled. Note
522 * that the hw_lock routines are responsible for
523 * maintaining preemption state.
524 */
525 void
526 usimple_unlock(
527 usimple_lock_t l)
528 {
529 #ifndef MACHINE_SIMPLE_LOCK
530 DECL_PC(pc);
531
532 OBTAIN_PC(pc);
533 USLDBG(usld_unlock(l, pc));
534 #if DEVELOPMENT || DEBUG
535 pltrace(TRUE);
536 #endif
537 hw_lock_unlock(&l->interlock);
538 #else
539 simple_unlock_rwmb((simple_lock_t)l);
540 #endif
541 }
542
543
544 /*
545 * Conditionally acquire a usimple_lock.
546 *
547 * On success, returns with preemption disabled.
548 * On failure, returns with preemption in the same state
549 * as when first invoked. Note that the hw_lock routines
550 * are responsible for maintaining preemption state.
551 *
552 * XXX No stats are gathered on a miss; I preserved this
553 * behavior from the original assembly-language code, but
554 * doesn't it make sense to log misses? XXX
555 */
556 unsigned int
557 usimple_lock_try(
558 usimple_lock_t l,
559 lck_grp_t *grp)
560 {
561 #ifndef MACHINE_SIMPLE_LOCK
562 unsigned int success;
563 DECL_PC(pc);
564
565 OBTAIN_PC(pc);
566 USLDBG(usld_lock_try_pre(l, pc));
567 if ((success = hw_lock_try(&l->interlock, grp))) {
568 #if DEVELOPMENT || DEBUG
569 pltrace(FALSE);
570 #endif
571 USLDBG(usld_lock_try_post(l, pc));
572 }
573 return success;
574 #else
575 return simple_lock_try((simple_lock_t)l, grp);
576 #endif
577 }
578
579 /*
580 * Acquire a usimple_lock while polling for pending cpu signals
581 * and spinning on a lock.
582 *
583 */
584 unsigned
585 int
586 (usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
587 uint64_t deadline
588 LCK_GRP_ARG(lck_grp_t *grp))
589 {
590 boolean_t istate = ml_get_interrupts_enabled();
591
592 if (deadline < mach_absolute_time()) {
593 return 0;
594 }
595
596 while (!simple_lock_try(l, grp)) {
597 if (!istate) {
598 cpu_signal_handler(NULL);
599 }
600
601 if (deadline < mach_absolute_time()) {
602 return 0;
603 }
604
605 cpu_pause();
606 }
607
608 return 1;
609 }
610
611 void
612 (usimple_lock_try_lock_loop)(usimple_lock_t l
613 LCK_GRP_ARG(lck_grp_t *grp))
614 {
615 usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
616 }
617
618 unsigned
619 int
620 (usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
621 uint64_t duration
622 LCK_GRP_ARG(lck_grp_t *grp))
623 {
624 uint64_t deadline;
625 uint64_t base_at = mach_absolute_time();
626 uint64_t duration_at;
627
628 nanoseconds_to_absolutetime(duration, &duration_at);
629 deadline = base_at + duration_at;
630 if (deadline < base_at) {
631 /* deadline has overflowed, make it saturate */
632 deadline = ULLONG_MAX;
633 }
634
635 return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
636 }
637
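/*
 * A minimal sketch of how a caller might use the bounded try-lock loops
 * above (hypothetical lock, group and timeout; illustration only).
 */
#if 0
static boolean_t
example_bounded_acquire(usimple_lock_t l, lck_grp_t *grp)
{
	/* spin for at most ~100 microseconds (duration is in nanoseconds) */
	if (!usimple_lock_try_lock_mp_signal_safe_loop_duration(l, 100000ULL, grp)) {
		return FALSE;           /* timed out; lock was not taken */
	}
	/* ... critical section, preemption is disabled ... */
	usimple_unlock(l);
	return TRUE;
}
#endif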
638 #if USLOCK_DEBUG
639 /*
640 * States of a usimple_lock. The default when initializing
641 * a usimple_lock is setting it up for debug checking.
642 */
643 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
644 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
645 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
646 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
647 #define USLOCK_CHECKING(l) (uslock_check && \
648 ((l)->debug.state & USLOCK_CHECKED))
649
650 /*
651 * Initialize the debugging information contained
652 * in a usimple_lock.
653 */
654 void
655 usld_lock_init(
656 usimple_lock_t l,
657 __unused unsigned short tag)
658 {
659 if (l == USIMPLE_LOCK_NULL) {
660 panic("lock initialization: null lock pointer");
661 }
662 l->lock_type = USLOCK_TAG;
663 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
664 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
665 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
666 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
667 l->debug.duration[0] = l->debug.duration[1] = 0;
668 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
669 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
670 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
671 }
672
673
674 /*
675 * These checks apply to all usimple_locks, not just
676 * those with USLOCK_CHECKED turned on.
677 */
678 int
679 usld_lock_common_checks(
680 usimple_lock_t l,
681 char *caller)
682 {
683 if (l == USIMPLE_LOCK_NULL) {
684 panic("%s: null lock pointer", caller);
685 }
686 if (l->lock_type != USLOCK_TAG) {
687 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
688 }
689 if (!(l->debug.state & USLOCK_INIT)) {
690 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
691 }
692 return USLOCK_CHECKING(l);
693 }
694
695
696 /*
697 * Debug checks on a usimple_lock just before attempting
698 * to acquire it.
699 */
700 /* ARGSUSED */
701 void
702 usld_lock_pre(
703 usimple_lock_t l,
704 pc_t pc)
705 {
706 char caller[] = "usimple_lock";
707
708
709 if (!usld_lock_common_checks(l, caller)) {
710 return;
711 }
712
713 /*
714 * Note that we have a weird case where we are getting a lock when we are
715 * in the process of putting the system to sleep. We are running with no
716 * current threads, therefore we can't tell if we are trying to retake a lock
717 * we have or someone on the other processor has it. Therefore we just
718 * ignore this test if the locking thread is 0.
719 */
720
721 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
722 l->debug.lock_thread == (void *) current_thread()) {
723 printf("%s: lock %p already locked (at %p) by",
724 caller, l, l->debug.lock_pc);
725 printf(" current thread %p (new attempt at pc %p)\n",
726 l->debug.lock_thread, pc);
727 panic("%s", caller);
728 }
729 mp_disable_preemption();
730 mp_enable_preemption();
731 }
732
733
734 /*
735 * Debug checks on a usimple_lock just after acquiring it.
736 *
737 * Pre-emption has been disabled at this point,
738 * so we are safe in using cpu_number.
739 */
740 void
741 usld_lock_post(
742 usimple_lock_t l,
743 pc_t pc)
744 {
745 int mycpu;
746 char caller[] = "successful usimple_lock";
747
748
749 if (!usld_lock_common_checks(l, caller)) {
750 return;
751 }
752
753 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
754 panic("%s: lock %p became uninitialized",
755 caller, l);
756 }
757 if ((l->debug.state & USLOCK_TAKEN)) {
758 panic("%s: lock 0x%p became TAKEN by someone else",
759 caller, l);
760 }
761
762 mycpu = cpu_number();
763 l->debug.lock_thread = (void *)current_thread();
764 l->debug.state |= USLOCK_TAKEN;
765 l->debug.lock_pc = pc;
766 l->debug.lock_cpu = mycpu;
767 }
768
769
770 /*
771 * Debug checks on a usimple_lock just before
772 * releasing it. Note that the caller has not
773 * yet released the hardware lock.
774 *
775 * Preemption is still disabled, so there's
776 * no problem using cpu_number.
777 */
778 void
779 usld_unlock(
780 usimple_lock_t l,
781 pc_t pc)
782 {
783 int mycpu;
784 char caller[] = "usimple_unlock";
785
786
787 if (!usld_lock_common_checks(l, caller)) {
788 return;
789 }
790
791 mycpu = cpu_number();
792
793 if (!(l->debug.state & USLOCK_TAKEN)) {
794 panic("%s: lock 0x%p hasn't been taken",
795 caller, l);
796 }
797 if (l->debug.lock_thread != (void *) current_thread()) {
798 panic("%s: unlocking lock 0x%p, owned by thread %p",
799 caller, l, l->debug.lock_thread);
800 }
801 if (l->debug.lock_cpu != mycpu) {
802 printf("%s: unlocking lock 0x%p on cpu 0x%x",
803 caller, l, mycpu);
804 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
805 panic("%s", caller);
806 }
807
808 l->debug.unlock_thread = l->debug.lock_thread;
809 l->debug.lock_thread = INVALID_THREAD;
810 l->debug.state &= ~USLOCK_TAKEN;
811 l->debug.unlock_pc = pc;
812 l->debug.unlock_cpu = mycpu;
813 }
814
815
816 /*
817 * Debug checks on a usimple_lock just before
818 * attempting to acquire it.
819 *
820 * Preemption isn't guaranteed to be disabled.
821 */
822 void
823 usld_lock_try_pre(
824 usimple_lock_t l,
825 __unused pc_t pc)
826 {
827 char caller[] = "usimple_lock_try";
828
829 if (!usld_lock_common_checks(l, caller)) {
830 return;
831 }
832 }
833
834
835 /*
836 * Debug checks on a usimple_lock just after
837 * successfully attempting to acquire it.
838 *
839 * Preemption has been disabled by the
840 * lock acquisition attempt, so it's safe
841 * to use cpu_number.
842 */
843 void
844 usld_lock_try_post(
845 usimple_lock_t l,
846 pc_t pc)
847 {
848 int mycpu;
849 char caller[] = "successful usimple_lock_try";
850
851 if (!usld_lock_common_checks(l, caller)) {
852 return;
853 }
854
855 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
856 panic("%s: lock 0x%p became uninitialized",
857 caller, l);
858 }
859 if ((l->debug.state & USLOCK_TAKEN)) {
860 panic("%s: lock 0x%p became TAKEN by someone else",
861 caller, l);
862 }
863
864 mycpu = cpu_number();
865 l->debug.lock_thread = (void *) current_thread();
866 l->debug.state |= USLOCK_TAKEN;
867 l->debug.lock_pc = pc;
868 l->debug.lock_cpu = mycpu;
869 }
870 #endif /* USLOCK_DEBUG */
871
872 /*
873 * Routine: lck_rw_alloc_init
874 */
875 lck_rw_t *
876 lck_rw_alloc_init(
877 lck_grp_t *grp,
878 lck_attr_t *attr)
879 {
880 lck_rw_t *lck;
881
882 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
883 bzero(lck, sizeof(lck_rw_t));
884 lck_rw_init(lck, grp, attr);
885 }
886
887 return lck;
888 }
889
890 /*
891 * Routine: lck_rw_free
892 */
893 void
894 lck_rw_free(
895 lck_rw_t *lck,
896 lck_grp_t *grp)
897 {
898 lck_rw_destroy(lck, grp);
899 kfree(lck, sizeof(lck_rw_t));
900 }
901
902 /*
903 * Routine: lck_rw_init
904 */
905 void
906 lck_rw_init(
907 lck_rw_t *lck,
908 lck_grp_t *grp,
909 lck_attr_t *attr)
910 {
911 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
912 attr : &LockDefaultLckAttr;
913
914 hw_lock_byte_init(&lck->lck_rw_interlock);
915 lck->lck_rw_want_write = FALSE;
916 lck->lck_rw_want_upgrade = FALSE;
917 lck->lck_rw_shared_count = 0;
918 lck->lck_rw_can_sleep = TRUE;
919 lck->lck_r_waiting = lck->lck_w_waiting = 0;
920 lck->lck_rw_tag = 0;
921 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
922 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
923
924 lck_grp_reference(grp);
925 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
926 }
927
928 /*
929 * Routine: lck_rw_destroy
930 */
931 void
932 lck_rw_destroy(
933 lck_rw_t *lck,
934 lck_grp_t *grp)
935 {
936 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
937 return;
938 }
939 #if MACH_LDEBUG
940 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
941 #endif
942 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
943 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
944 lck_grp_deallocate(grp);
945 return;
946 }
947
948 /*
949 * Sleep locks. These use the same data structure and algorithm
950 * as the spin locks, but the process sleeps while it is waiting
951 * for the lock. These work on uniprocessor systems.
952 */
953
954 #define DECREMENTER_TIMEOUT 1000000
955
956 /*
957 * We disable interrupts while holding the RW interlock to prevent an
958 * interrupt from exacerbating hold time.
959 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
960 */
961 static inline boolean_t
962 lck_interlock_lock(lck_rw_t *lck)
963 {
964 boolean_t istate;
965
966 istate = ml_set_interrupts_enabled(FALSE);
967 hw_lock_byte_lock(&lck->lck_rw_interlock);
968 return istate;
969 }
970
971 static inline void
972 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
973 {
974 hw_lock_byte_unlock(&lck->lck_rw_interlock);
975 ml_set_interrupts_enabled(istate);
976 }
977
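/*
 * A minimal sketch of the pattern these helpers encapsulate (hypothetical
 * caller): keep the returned interrupt state and hand it back on unlock,
 * exactly as the wait paths below do. Illustration only.
 */
#if 0
static void
example_interlock_usage(lck_rw_t *lck)
{
	boolean_t istate = lck_interlock_lock(lck);
	/* ... examine or update lck_rw_t state behind the interlock ... */
	lck_interlock_unlock(lck, istate);
}
#endif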
978 /*
979 * This inline is used when busy-waiting for an rw lock.
980 * If interrupts were disabled when the lock primitive was called,
981 * we poll the IPI handler for pending tlb flushes.
982 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
983 */
984 static inline void
985 lck_rw_lock_pause(boolean_t interrupts_enabled)
986 {
987 if (!interrupts_enabled) {
988 handle_pending_TLB_flushes();
989 }
990 cpu_pause();
991 }
992
993 static inline boolean_t
994 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
995 {
996 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
997 return TRUE;
998 }
999 return FALSE;
1000 }
1001
1002 /*
1003 * compute the deadline to spin against when
1004 * waiting for a change of state on a lck_rw_t
1005 */
1006 static inline uint64_t
1007 lck_rw_deadline_for_spin(lck_rw_t *lck)
1008 {
1009 if (lck->lck_rw_can_sleep) {
1010 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
1011 /*
1012 * there are already threads waiting on this lock... this
1013 * implies that they have spun beyond their deadlines waiting for
1014 * the desired state to show up so we will not bother spinning at this time...
1015 * or
1016 * the current number of threads sharing this lock exceeds our capacity to run them
1017 * concurrently and since all states we're going to spin for require the rw_shared_count
1018 * to be at 0, we'll not bother spinning since the latency for this to happen is
1019 * unpredictable...
1020 */
1021 return mach_absolute_time();
1022 }
1023 return mach_absolute_time() + MutexSpin;
1024 } else {
1025 return mach_absolute_time() + (100000LL * 1000000000LL);
1026 }
1027 }
1028
1029
1030 /*
1031 * Spin while interlock is held.
1032 */
1033
1034 static inline void
1035 lck_rw_interlock_spin(lck_rw_t *lock)
1036 {
1037 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1038 cpu_pause();
1039 }
1040 }
1041
1042 static boolean_t
1043 lck_rw_grab_want(lck_rw_t *lock)
1044 {
1045 uint32_t data, prev;
1046
1047 for (;;) {
1048 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
1049 if ((data & LCK_RW_INTERLOCK) == 0) {
1050 break;
1051 }
1052 atomic_exchange_abort();
1053 lck_rw_interlock_spin(lock);
1054 }
1055 if (data & LCK_RW_WANT_WRITE) {
1056 atomic_exchange_abort();
1057 return FALSE;
1058 }
1059 data |= LCK_RW_WANT_WRITE;
1060 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1061 }
1062
1063 static boolean_t
1064 lck_rw_grab_shared(lck_rw_t *lock)
1065 {
1066 uint32_t data, prev;
1067
1068 for (;;) {
1069 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1070 if ((data & LCK_RW_INTERLOCK) == 0) {
1071 break;
1072 }
1073 atomic_exchange_abort();
1074 lck_rw_interlock_spin(lock);
1075 }
1076 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1077 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1078 atomic_exchange_abort();
1079 return FALSE;
1080 }
1081 }
1082 data += LCK_RW_SHARED_READER;
1083 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1084 }
1085
1086 /*
1087 * Routine: lck_rw_lock_exclusive
1088 */
1089 static void
1090 lck_rw_lock_exclusive_gen(
1091 lck_rw_t *lck)
1092 {
1093 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1094 uint64_t deadline = 0;
1095 int slept = 0;
1096 int gotlock = 0;
1097 int lockheld = 0;
1098 wait_result_t res = 0;
1099 boolean_t istate = -1;
1100
1101 #if CONFIG_DTRACE
1102 boolean_t dtrace_ls_initialized = FALSE;
1103 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1104 uint64_t wait_interval = 0;
1105 int readers_at_sleep = 0;
1106 #endif
1107
1108 /*
1109 * Try to acquire the lck_rw_want_write bit.
1110 */
1111 while (!lck_rw_grab_want(lck)) {
1112 #if CONFIG_DTRACE
1113 if (dtrace_ls_initialized == FALSE) {
1114 dtrace_ls_initialized = TRUE;
1115 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1116 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1117 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1118 if (dtrace_ls_enabled) {
1119 /*
1120 * Either sleeping or spinning is happening,
1121 * start a timing of our delay interval now.
1122 */
1123 readers_at_sleep = lck->lck_rw_shared_count;
1124 wait_interval = mach_absolute_time();
1125 }
1126 }
1127 #endif
1128 if (istate == -1) {
1129 istate = ml_get_interrupts_enabled();
1130 }
1131
1132 deadline = lck_rw_deadline_for_spin(lck);
1133
1134 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1135
1136 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
1137 lck_rw_lock_pause(istate);
1138 }
1139
1140 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1141
1142 if (gotlock) {
1143 break;
1144 }
1145 /*
1146 * if we get here, the deadline has expired w/o us
1147 * being able to grab the lock exclusively
1148 * check to see if we're allowed to do a thread_block
1149 */
1150 if (lck->lck_rw_can_sleep) {
1151 istate = lck_interlock_lock(lck);
1152
1153 if (lck->lck_rw_want_write) {
1154 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1155
1156 lck->lck_w_waiting = TRUE;
1157
1158 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1159 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1160 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1161 lck_interlock_unlock(lck, istate);
1162
1163 if (res == THREAD_WAITING) {
1164 res = thread_block(THREAD_CONTINUE_NULL);
1165 slept++;
1166 }
1167 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1168 } else {
1169 lck->lck_rw_want_write = TRUE;
1170 lck_interlock_unlock(lck, istate);
1171 break;
1172 }
1173 }
1174 }
1175 /*
1176 * Wait for readers (and upgrades) to finish...
1177 * the test for these conditions must be done simultaneously with
1178 * a check of the interlock not being held since
1179 * the rw_shared_count will drop to 0 first and then want_upgrade
1180 * will be set to 1 in the shared_to_exclusive scenario... those
1181 * adjustments are done behind the interlock and represent an
1182 * atomic change in state and must be considered as such
1183 * however, once we see the read count at 0, the want_upgrade not set
1184 * and the interlock not held, we are safe to proceed
1185 */
1186 while (lck_rw_held_read_or_upgrade(lck)) {
1187 #if CONFIG_DTRACE
1188 /*
1189 * Either sleeping or spinning is happening, start
1190 * a timing of our delay interval now. If we set it
1191 * to -1 we don't have accurate data so we cannot later
1192 * decide to record a dtrace spin or sleep event.
1193 */
1194 if (dtrace_ls_initialized == FALSE) {
1195 dtrace_ls_initialized = TRUE;
1196 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1197 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1198 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1199 if (dtrace_ls_enabled) {
1200 /*
1201 * Either sleeping or spinning is happening,
1202 * start a timing of our delay interval now.
1203 */
1204 readers_at_sleep = lck->lck_rw_shared_count;
1205 wait_interval = mach_absolute_time();
1206 }
1207 }
1208 #endif
1209 if (istate == -1) {
1210 istate = ml_get_interrupts_enabled();
1211 }
1212
1213 deadline = lck_rw_deadline_for_spin(lck);
1214
1215 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1216
1217 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
1218 lck_rw_lock_pause(istate);
1219 }
1220
1221 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1222
1223 if (!lockheld) {
1224 break;
1225 }
1226 /*
1227 * if we get here, the deadline has expired w/o us
1228 * being able to grab the lock exclusively
1229 * check to see if we're allowed to do a thread_block
1230 */
1231 if (lck->lck_rw_can_sleep) {
1232 istate = lck_interlock_lock(lck);
1233
1234 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1235 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1236
1237 lck->lck_w_waiting = TRUE;
1238
1239 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1240 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1241 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1242 lck_interlock_unlock(lck, istate);
1243
1244 if (res == THREAD_WAITING) {
1245 res = thread_block(THREAD_CONTINUE_NULL);
1246 slept++;
1247 }
1248 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1249 } else {
1250 lck_interlock_unlock(lck, istate);
1251 /*
1252 * must own the lock now, since we checked for
1253 * readers or upgrade owner behind the interlock
1254 * no need for a call to 'lck_rw_held_read_or_upgrade'
1255 */
1256 break;
1257 }
1258 }
1259 }
1260
1261 #if CONFIG_DTRACE
1262 /*
1263 * Decide what latencies we suffered that are Dtrace events.
1264 * If we have set wait_interval, then we either spun or slept.
1265 * At least we get out from under the interlock before we record
1266 * which is the best we can do here to minimize the impact
1267 * of the tracing.
1268 * If we have set wait_interval to -1, then dtrace was not enabled when we
1269 * started sleeping/spinning so we don't record this event.
1270 */
1271 if (dtrace_ls_enabled == TRUE) {
1272 if (slept == 0) {
1273 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1274 mach_absolute_time() - wait_interval, 1);
1275 } else {
1276 /*
1277 * For the blocking case, we also record if when we blocked
1278 * it was held for read or write, and how many readers.
1279 * Notice that above we recorded this before we dropped
1280 * the interlock so the count is accurate.
1281 */
1282 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1283 mach_absolute_time() - wait_interval, 1,
1284 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1285 }
1286 }
1287 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1288 #endif
1289 }
1290
1291 /*
1292 * Routine: lck_rw_done
1293 */
1294
1295 lck_rw_type_t
1296 lck_rw_done(lck_rw_t *lock)
1297 {
1298 uint32_t data, prev;
1299
1300 for (;;) {
1301 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1302 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1303 atomic_exchange_abort();
1304 lck_rw_interlock_spin(lock);
1305 continue;
1306 }
1307 if (data & LCK_RW_SHARED_MASK) {
1308 data -= LCK_RW_SHARED_READER;
1309 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1310 goto check_waiters;
1311 }
1312 } else { /* if reader count == 0, must be exclusive lock */
1313 if (data & LCK_RW_WANT_UPGRADE) {
1314 data &= ~(LCK_RW_WANT_UPGRADE);
1315 } else {
1316 if (data & LCK_RW_WANT_WRITE) {
1317 data &= ~(LCK_RW_WANT_EXCL);
1318 } else { /* lock is not 'owned', panic */
1319 panic("Releasing non-exclusive RW lock without a reader refcount!");
1320 }
1321 }
1322 check_waiters:
1323 if (prev & LCK_RW_W_WAITING) {
1324 data &= ~(LCK_RW_W_WAITING);
1325 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1326 data &= ~(LCK_RW_R_WAITING);
1327 }
1328 } else {
1329 data &= ~(LCK_RW_R_WAITING);
1330 }
1331 }
1332 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1333 break;
1334 }
1335 cpu_pause();
1336 }
1337 return lck_rw_done_gen(lock, prev);
1338 }
1339
1340 /*
1341 * Routine: lck_rw_done_gen
1342 *
1343 * called from lck_rw_done()
1344 * prior_lock_state is the value in the 1st
1345 * word of the lock at the time of a successful
1346 * atomic compare and exchange with the new value...
1347 * it represents the state of the lock before we
1348 * decremented the rw_shared_count or cleared either
1349 * rw_want_upgrade or rw_want_write and
1350 * the lck_x_waiting bits... since the wrapper
1351 * routine has already changed the state atomically,
1352 * we just need to decide if we should
1353 * wake up anyone and what value to return... we do
1354 * this by examining the state of the lock before
1355 * we changed it
1356 */
1357 static lck_rw_type_t
1358 lck_rw_done_gen(
1359 lck_rw_t *lck,
1360 uint32_t prior_lock_state)
1361 {
1362 lck_rw_t *fake_lck;
1363 lck_rw_type_t lock_type;
1364 thread_t thread;
1365 uint32_t rwlock_count;
1366
1367 thread = current_thread();
1368 rwlock_count = thread->rwlock_count--;
1369 fake_lck = (lck_rw_t *)&prior_lock_state;
1370
1371 if (lck->lck_rw_can_sleep) {
1372 /*
1373 * prior_lock state is a snapshot of the 1st word of the
1374 * lock in question... we'll fake up a pointer to it
1375 * and carefully not access anything beyond whats defined
1376 * in the first word of a lck_rw_t
1377 */
1378
1379 if (fake_lck->lck_rw_shared_count <= 1) {
1380 if (fake_lck->lck_w_waiting) {
1381 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1382 }
1383
1384 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1385 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1386 }
1387 }
1388 #if MACH_LDEBUG
1389 if (rwlock_count == 0) {
1390 panic("rw lock count underflow for thread %p", thread);
1391 }
1392 #endif
1393 /* Check if dropping the lock means that we need to unpromote */
1394
1395 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1396 /* sched_flags checked without lock, but will be rechecked while clearing */
1397 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1398 }
1399 }
1400 if (fake_lck->lck_rw_shared_count) {
1401 lock_type = LCK_RW_TYPE_SHARED;
1402 } else {
1403 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1404 }
1405
1406 #if CONFIG_DTRACE
1407 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1408 #endif
1409
1410 return lock_type;
1411 }
1412
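/*
 * A minimal sketch of the prior_lock_state technique used above: the 32-bit
 * snapshot taken before the atomic update is reinterpreted through the
 * lck_rw_t layout so its flag bits can be read by name (hypothetical helper,
 * illustration only).
 */
#if 0
static boolean_t
example_snapshot_had_writer_waiting(uint32_t prior_lock_state)
{
	lck_rw_t *fake_lck = (lck_rw_t *)&prior_lock_state;

	return fake_lck->lck_w_waiting ? TRUE : FALSE;
}
#endif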
1413
1414 /*
1415 * Routine: lck_rw_unlock
1416 */
1417 void
1418 lck_rw_unlock(
1419 lck_rw_t *lck,
1420 lck_rw_type_t lck_rw_type)
1421 {
1422 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1423 lck_rw_unlock_shared(lck);
1424 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1425 lck_rw_unlock_exclusive(lck);
1426 } else {
1427 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1428 }
1429 }
1430
1431
1432 /*
1433 * Routine: lck_rw_unlock_shared
1434 */
1435 void
1436 lck_rw_unlock_shared(
1437 lck_rw_t *lck)
1438 {
1439 lck_rw_type_t ret;
1440
1441 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1442 ret = lck_rw_done(lck);
1443
1444 if (ret != LCK_RW_TYPE_SHARED) {
1445 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1446 }
1447 }
1448
1449
1450 /*
1451 * Routine: lck_rw_unlock_exclusive
1452 */
1453 void
1454 lck_rw_unlock_exclusive(
1455 lck_rw_t *lck)
1456 {
1457 lck_rw_type_t ret;
1458
1459 ret = lck_rw_done(lck);
1460
1461 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1462 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1463 }
1464 }
1465
1466
1467 /*
1468 * Routine: lck_rw_lock
1469 */
1470 void
1471 lck_rw_lock(
1472 lck_rw_t *lck,
1473 lck_rw_type_t lck_rw_type)
1474 {
1475 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1476 lck_rw_lock_shared(lck);
1477 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1478 lck_rw_lock_exclusive(lck);
1479 } else {
1480 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1481 }
1482 }
1483
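/*
 * A minimal usage sketch of the rw-lock interfaces (hypothetical lock group
 * and lock; real callers usually embed the lck_rw_t in their own data
 * structures). Illustration only, kept under #if 0.
 */
#if 0
static void
example_rw_usage(void)
{
	lck_grp_t *grp  = lck_grp_alloc_init("example-rw", LCK_GRP_ATTR_NULL);
	lck_rw_t  *lock = lck_rw_alloc_init(grp, LCK_ATTR_NULL);

	/* reader side: any number of threads may hold the lock shared */
	lck_rw_lock_shared(lock);
	/* ... read shared data ... */
	lck_rw_unlock_shared(lock);

	/* writer side: exclusive access */
	lck_rw_lock_exclusive(lock);
	/* ... modify shared data ... */
	lck_rw_unlock_exclusive(lock);

	lck_rw_free(lock, grp);
	lck_grp_free(grp);
}
#endif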
1484 /*
1485 * Routine: lck_rw_lock_shared
1486 */
1487 void
1488 lck_rw_lock_shared(lck_rw_t *lock)
1489 {
1490 uint32_t data, prev;
1491
1492 current_thread()->rwlock_count++;
1493 for (;;) {
1494 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1495 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1496 atomic_exchange_abort();
1497 if (lock->lck_rw_can_sleep) {
1498 lck_rw_lock_shared_gen(lock);
1499 } else {
1500 cpu_pause();
1501 continue;
1502 }
1503 break;
1504 }
1505 data += LCK_RW_SHARED_READER;
1506 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1507 break;
1508 }
1509 cpu_pause();
1510 }
1511 #if CONFIG_DTRACE
1512 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1513 #endif /* CONFIG_DTRACE */
1514 return;
1515 }
1516
1517 /*
1518 * Routine: lck_rw_lock_shared_gen
1519 * Function:
1520 * the fast path code above has determined that this lock
1521 * is held exclusively... this is where we spin/block
1522 * until we can acquire the lock in the shared mode
1523 */
1524 static void
1525 lck_rw_lock_shared_gen(
1526 lck_rw_t *lck)
1527 {
1528 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1529 uint64_t deadline = 0;
1530 int gotlock = 0;
1531 int slept = 0;
1532 wait_result_t res = 0;
1533 boolean_t istate = -1;
1534
1535 #if CONFIG_DTRACE
1536 uint64_t wait_interval = 0;
1537 int readers_at_sleep = 0;
1538 boolean_t dtrace_ls_initialized = FALSE;
1539 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1540 #endif
1541
1542 while (!lck_rw_grab_shared(lck)) {
1543 #if CONFIG_DTRACE
1544 if (dtrace_ls_initialized == FALSE) {
1545 dtrace_ls_initialized = TRUE;
1546 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1547 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1548 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1549 if (dtrace_ls_enabled) {
1550 /*
1551 * Either sleeping or spinning is happening,
1552 * start a timing of our delay interval now.
1553 */
1554 readers_at_sleep = lck->lck_rw_shared_count;
1555 wait_interval = mach_absolute_time();
1556 }
1557 }
1558 #endif
1559 if (istate == -1) {
1560 istate = ml_get_interrupts_enabled();
1561 }
1562
1563 deadline = lck_rw_deadline_for_spin(lck);
1564
1565 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1566 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1567
1568 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
1569 lck_rw_lock_pause(istate);
1570 }
1571
1572 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1573 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1574
1575 if (gotlock) {
1576 break;
1577 }
1578 /*
1579 * if we get here, the deadline has expired w/o us
1580 * being able to grab the lock for read
1581 * check to see if we're allowed to do a thread_block
1582 */
1583 if (lck->lck_rw_can_sleep) {
1584 istate = lck_interlock_lock(lck);
1585
1586 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1587 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1588 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1589 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1590
1591 lck->lck_r_waiting = TRUE;
1592
1593 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1594 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1595 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1596 lck_interlock_unlock(lck, istate);
1597
1598 if (res == THREAD_WAITING) {
1599 res = thread_block(THREAD_CONTINUE_NULL);
1600 slept++;
1601 }
1602 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1603 trace_lck, res, slept, 0, 0);
1604 } else {
1605 lck->lck_rw_shared_count++;
1606 lck_interlock_unlock(lck, istate);
1607 break;
1608 }
1609 }
1610 }
1611
1612 #if CONFIG_DTRACE
1613 if (dtrace_ls_enabled == TRUE) {
1614 if (slept == 0) {
1615 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1616 } else {
1617 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1618 mach_absolute_time() - wait_interval, 0,
1619 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1620 }
1621 }
1622 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1623 #endif
1624 }
1625
1626
1627 /*
1628 * Routine: lck_rw_lock_exclusive
1629 */
1630
1631 void
1632 lck_rw_lock_exclusive(lck_rw_t *lock)
1633 {
1634 current_thread()->rwlock_count++;
1635 if (atomic_test_and_set32(&lock->data,
1636 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1637 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1638 #if CONFIG_DTRACE
1639 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1640 #endif /* CONFIG_DTRACE */
1641 } else {
1642 lck_rw_lock_exclusive_gen(lock);
1643 }
1644 }
1645
1646
1647 /*
1648 * Routine: lck_rw_lock_shared_to_exclusive
1649 *
1650 * FALSE is returned upon failure; in that case the shared hold has already been dropped.
1651 */
1652
1653 boolean_t
1654 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1655 {
1656 uint32_t data, prev;
1657
1658 for (;;) {
1659 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1660 if (data & LCK_RW_INTERLOCK) {
1661 atomic_exchange_abort();
1662 lck_rw_interlock_spin(lock);
1663 continue;
1664 }
1665 if (data & LCK_RW_WANT_UPGRADE) {
1666 data -= LCK_RW_SHARED_READER;
1667 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1668 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1669 }
1670 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1671 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1672 }
1673 } else {
1674 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1675 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1676 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1677 break;
1678 }
1679 }
1680 cpu_pause();
1681 }
1682 /* we now own the WANT_UPGRADE */
1683 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1684 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1685 }
1686 #if CONFIG_DTRACE
1687 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1688 #endif
1689 return TRUE;
1690 }
1691
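/*
 * A minimal sketch of the upgrade pattern (hypothetical caller). Note that
 * on failure the shared hold has already been dropped, so the caller must
 * re-acquire the lock exclusively and re-validate anything it read.
 * Illustration only.
 */
#if 0
static void
example_upgrade(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	/* ... decide that an update is needed ... */
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* shared hold was dropped; start over with an exclusive hold */
		lck_rw_lock_exclusive(lock);
		/* ... re-validate the state read under the shared hold ... */
	}
	/* ... perform the update while held exclusively ... */
	lck_rw_unlock_exclusive(lock);
}
#endif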
1692
1693 /*
1694 * Routine: lck_rw_lock_shared_to_exclusive_failure
1695 * Function:
1696 * the fast path code has already dropped our read
1697 * count and determined that someone else owns 'lck_rw_want_upgrade'
1698 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
1699 * all we need to do here is determine if a wakeup is needed
1700 */
1701 static boolean_t
1702 lck_rw_lock_shared_to_exclusive_failure(
1703 lck_rw_t *lck,
1704 uint32_t prior_lock_state)
1705 {
1706 lck_rw_t *fake_lck;
1707 thread_t thread = current_thread();
1708 uint32_t rwlock_count;
1709
1710 /* Check if dropping the lock means that we need to unpromote */
1711 rwlock_count = thread->rwlock_count--;
1712 #if MACH_LDEBUG
1713 if (rwlock_count == 0) {
1714 panic("rw lock count underflow for thread %p", thread);
1715 }
1716 #endif
1717 fake_lck = (lck_rw_t *)&prior_lock_state;
1718
1719 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1720 /*
1721 * Someone else has requested upgrade.
1722 * Since we've released the read lock, wake
1723 * him up if he's blocked waiting
1724 */
1725 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1726 }
1727
1728 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1729 /* sched_flags checked without lock, but will be rechecked while clearing */
1730 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1731 }
1732
1733 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1734 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1735
1736 return FALSE;
1737 }
1738
1739
1740 /*
1741 * Routine: lck_rw_lock_shared_to_exclusive_success
1742 * Function:
1743 * the fast path code has already dropped our read
1744 * count and successfully acquired 'lck_rw_want_upgrade'
1745 * we just need to wait for the rest of the readers to drain
1746 * and then we can return as the exclusive holder of this lock
1747 */
1748 static boolean_t
1749 lck_rw_lock_shared_to_exclusive_success(
1750 lck_rw_t *lck)
1751 {
1752 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1753 uint64_t deadline = 0;
1754 int slept = 0;
1755 int still_shared = 0;
1756 wait_result_t res;
1757 boolean_t istate = -1;
1758
1759 #if CONFIG_DTRACE
1760 uint64_t wait_interval = 0;
1761 int readers_at_sleep = 0;
1762 boolean_t dtrace_ls_initialized = FALSE;
1763 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1764 #endif
1765
1766 while (lck->lck_rw_shared_count != 0) {
1767 #if CONFIG_DTRACE
1768 if (dtrace_ls_initialized == FALSE) {
1769 dtrace_ls_initialized = TRUE;
1770 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1771 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1772 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1773 if (dtrace_ls_enabled) {
1774 /*
1775 * Either sleeping or spinning is happening,
1776 * start a timing of our delay interval now.
1777 */
1778 readers_at_sleep = lck->lck_rw_shared_count;
1779 wait_interval = mach_absolute_time();
1780 }
1781 }
1782 #endif
1783 if (istate == -1) {
1784 istate = ml_get_interrupts_enabled();
1785 }
1786
1787 deadline = lck_rw_deadline_for_spin(lck);
1788
1789 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1790 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1791
1792 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
1793 lck_rw_lock_pause(istate);
1794 }
1795
1796 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1797 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1798
1799 if (!still_shared) {
1800 break;
1801 }
1802 /*
1803 * if we get here, the deadline has expired w/o
1804 * the rw_shared_count having drained to 0
1805 * check to see if we're allowed to do a thread_block
1806 */
1807 if (lck->lck_rw_can_sleep) {
1808 istate = lck_interlock_lock(lck);
1809
1810 if (lck->lck_rw_shared_count != 0) {
1811 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1812 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1813
1814 lck->lck_w_waiting = TRUE;
1815
1816 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1817 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1818 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1819 lck_interlock_unlock(lck, istate);
1820
1821 if (res == THREAD_WAITING) {
1822 res = thread_block(THREAD_CONTINUE_NULL);
1823 slept++;
1824 }
1825 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1826 trace_lck, res, slept, 0, 0);
1827 } else {
1828 lck_interlock_unlock(lck, istate);
1829 break;
1830 }
1831 }
1832 }
1833 #if CONFIG_DTRACE
1834 /*
1835 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1836 */
1837 if (dtrace_ls_enabled == TRUE) {
1838 if (slept == 0) {
1839 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1840 } else {
1841 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1842 mach_absolute_time() - wait_interval, 1,
1843 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1844 }
1845 }
1846 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1847 #endif
1848 return TRUE;
1849 }
1850
1851 /*
1852 * Routine: lck_rw_lock_exclusive_to_shared
1853 */
1854
1855 void
1856 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1857 {
1858 uint32_t data, prev;
1859
1860 for (;;) {
1861 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1862 if (data & LCK_RW_INTERLOCK) {
1863 atomic_exchange_abort();
1864 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1865 continue;
1866 }
1867 data += LCK_RW_SHARED_READER;
1868 if (data & LCK_RW_WANT_UPGRADE) {
1869 data &= ~(LCK_RW_WANT_UPGRADE);
1870 } else {
1871 data &= ~(LCK_RW_WANT_EXCL);
1872 }
1873 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1874 data &= ~(LCK_RW_W_WAITING);
1875 }
1876 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1877 break;
1878 }
1879 cpu_pause();
1880 }
1881 lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1882 }
1883
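/*
 * A minimal sketch of the downgrade pattern (hypothetical caller): publish
 * an update under the exclusive hold, then keep reading under a shared hold
 * without blocking other readers. Illustration only.
 */
#if 0
static void
example_downgrade(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	/* ... modify shared data ... */
	lck_rw_lock_exclusive_to_shared(lock);  /* other readers may now enter */
	/* ... continue reading the data just published ... */
	lck_rw_unlock_shared(lock);
}
#endif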
1884
1885 /*
1886 * Routine: lck_rw_lock_exclusive_to_shared_gen
1887 * Function:
1888 * the fast path has already dropped
1889 * our exclusive state and bumped lck_rw_shared_count
1890 * all we need to do here is determine if anyone
1891 * needs to be awakened.
1892 */
1893 static void
1894 lck_rw_lock_exclusive_to_shared_gen(
1895 lck_rw_t *lck,
1896 uint32_t prior_lock_state)
1897 {
1898 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1899 lck_rw_t *fake_lck;
1900
1901 fake_lck = (lck_rw_t *)&prior_lock_state;
1902
1903 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1904 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1905
1906 /*
1907 * don't wake up anyone waiting to take the lock exclusively
1908 * since we hold a read count... when the read count drops to 0,
1909 * the writers will be woken.
1910 *
1911 * wake up any waiting readers if we don't have any writers waiting,
1912 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1913 */
1914 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1915 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1916 }
1917
1918 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1919 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1920
1921 #if CONFIG_DTRACE
1922 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1923 #endif
1924 }
1925
1926
1927 /*
1928 * Routine: lck_rw_try_lock
1929 */
1930 boolean_t
1931 lck_rw_try_lock(
1932 lck_rw_t *lck,
1933 lck_rw_type_t lck_rw_type)
1934 {
1935 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1936 return lck_rw_try_lock_shared(lck);
1937 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1938 return lck_rw_try_lock_exclusive(lck);
1939 } else {
1940 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1941 }
1942 return FALSE;
1943 }
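/*
 * Editorial sketch (not part of the original source): lck_rw_try_lock() is the
 * non-blocking entry point, so callers must handle a FALSE return. A common
 * pattern is to skip optional work when the lock is busy; update_stats_locked()
 * is a hypothetical helper, shown only for illustration.
 *
 *      if (lck_rw_try_lock(lck, LCK_RW_TYPE_SHARED)) {
 *              update_stats_locked(obj);
 *              lck_rw_unlock(lck, LCK_RW_TYPE_SHARED);
 *      } else {
 *              // lock is busy: fall back to the blocking path or skip the work
 *      }
 */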
1944
1945 /*
1946 * Routine: lck_rw_try_lock_shared
1947 */
1948
1949 boolean_t
1950 lck_rw_try_lock_shared(lck_rw_t *lock)
1951 {
1952 uint32_t data, prev;
1953
1954 for (;;) {
1955 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1956 if (data & LCK_RW_INTERLOCK) {
1957 atomic_exchange_abort();
1958 lck_rw_interlock_spin(lock);
1959 continue;
1960 }
1961 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1962 atomic_exchange_abort();
1963 return FALSE; /* lock is busy */
1964 }
1965 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1966 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1967 break;
1968 }
1969 cpu_pause();
1970 }
1971 current_thread()->rwlock_count++;
1972 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1973 #if CONFIG_DTRACE
1974 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1975 #endif /* CONFIG_DTRACE */
1976 return TRUE;
1977 }
1978
1979
1980 /*
1981 * Routine: lck_rw_try_lock_exclusive
1982 */
1983
1984 boolean_t
1985 lck_rw_try_lock_exclusive(lck_rw_t *lock)
1986 {
1987 uint32_t data, prev;
1988
1989 for (;;) {
1990 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1991 if (data & LCK_RW_INTERLOCK) {
1992 atomic_exchange_abort();
1993 lck_rw_interlock_spin(lock);
1994 continue;
1995 }
1996 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1997 atomic_exchange_abort();
1998 return FALSE; /* can't get it */
1999 }
2000 data |= LCK_RW_WANT_EXCL;
2001 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
2002 break;
2003 }
2004 cpu_pause();
2005 }
2006
2007 current_thread()->rwlock_count++;
2008 #if CONFIG_DTRACE
2009 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2010 #endif /* CONFIG_DTRACE */
2011 return TRUE;
2012 }
2013
2014
2015 void
2016 lck_rw_assert(
2017 lck_rw_t *lck,
2018 unsigned int type)
2019 {
2020 switch (type) {
2021 case LCK_RW_ASSERT_SHARED:
2022 if (lck->lck_rw_shared_count != 0) {
2023 return;
2024 }
2025 break;
2026 case LCK_RW_ASSERT_EXCLUSIVE:
2027 if ((lck->lck_rw_want_write ||
2028 lck->lck_rw_want_upgrade) &&
2029 lck->lck_rw_shared_count == 0) {
2030 return;
2031 }
2032 break;
2033 case LCK_RW_ASSERT_HELD:
2034 if (lck->lck_rw_want_write ||
2035 lck->lck_rw_want_upgrade ||
2036 lck->lck_rw_shared_count != 0) {
2037 return;
2038 }
2039 break;
2040 case LCK_RW_ASSERT_NOTHELD:
2041 if (!(lck->lck_rw_want_write ||
2042 lck->lck_rw_want_upgrade ||
2043 lck->lck_rw_shared_count != 0)) {
2044 return;
2045 }
2046 break;
2047 default:
2048 break;
2049 }
2050
2051 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
2052 }
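/*
 * Editorial sketch (not part of the original source): lck_rw_assert() is meant
 * to document and enforce locking preconditions in helpers that do not take the
 * lock themselves. table_t, entry_t and the routine below are hypothetical.
 *
 *      static void
 *      table_remove_entry_locked(table_t *t, entry_t *e)
 *      {
 *              // caller must already hold t->lock exclusively
 *              lck_rw_assert(&t->lock, LCK_RW_ASSERT_EXCLUSIVE);
 *              ...
 *      }
 */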
2053
2054 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
2055 #if MACH_LDEBUG
2056 __dead2
2057 #endif
2058 void
2059 lck_rw_clear_promotions_x86(thread_t thread)
2060 {
2061 #if MACH_LDEBUG
2062 /* It's fatal to leave a RW lock locked and return to userspace */
2063 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2064 #else
2065 /* Paper over the issue */
2066 thread->rwlock_count = 0;
2067 lck_rw_clear_promotion(thread, 0);
2068 #endif
2069 }
2070
2071 boolean_t
2072 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2073 {
2074 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2075
2076 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2077 lck_rw_unlock_shared(lck);
2078 mutex_pause(2);
2079 lck_rw_lock_shared(lck);
2080 return TRUE;
2081 }
2082
2083 return FALSE;
2084 }
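/*
 * Editorial sketch (not part of the original source): the yield helper above is
 * intended for long scans performed under a shared hold. A TRUE return means the
 * lock was dropped and re-acquired, so any state cached under the previous hold
 * must be re-validated. scan_next() and process_entry() are hypothetical helpers.
 *
 *      entry_t *e = NULL;
 *
 *      lck_rw_lock_shared(lck);
 *      while ((e = scan_next(table, e)) != NULL) {
 *              process_entry(e);
 *              if (lck_rw_lock_yield_shared(lck, FALSE)) {
 *                      // the lock was dropped and re-acquired: cached state is
 *                      // stale, so restart the scan from the beginning
 *                      e = NULL;
 *              }
 *      }
 *      lck_rw_unlock_shared(lck);
 */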
2085
2086 /*
2087 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2088 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2089 */
2090 boolean_t
2091 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2092 {
2093 if (not_in_kdp) {
2094 panic("panic: rw lock exclusive check done outside of kernel debugger");
2095 }
2096 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2097 }
2098
2099 /*
2100 * Slow path routines for lck_mtx locking and unlocking functions.
2101 *
2102 * These functions were previously implemented in x86 assembly,
2103 * and some optimizations are in place in this C code so that the compiled code
2104 * is as performant and compact as the assembly version.
2105 *
2106 * To avoid inlining these functions into the fast path, all functions directly called by
2107 * the fast paths are marked __attribute__((noinline)). They are also implemented
2108 * in such a way that the fast path can tail call into them. In this way the return address
2109 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2110 *
2111 * Slow path code is structured in such a way that there are no calls to functions that will return
2112 * in the context of the caller, i.e. every function called is either a tail call
2113 * or an inline function. The tail call functions take fewer than six arguments,
2114 * so that they can be passed in registers and do not need to be pushed on the stack.
2115 * This allows the compiler to avoid creating a stack frame for these functions.
2116 *
2117 * __improbable and __probable are used to compile the slow path code in such a way that
2118 * the fast path case ends up as a sequence of instructions with as few jumps as possible,
2119 * to keep this case the most optimized even when falling through to the slow path.
2120 */
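/*
 * Editorial sketch (not part of the original source): the general shape of the
 * fast-path/slow-path split described above, using hypothetical names
 * (example_lock_t, EXAMPLE_LOCKED). The fast path stays branch-light and
 * frame-less, and ends in a tail call so the slow path returns directly to the
 * original caller.
 *
 *      __attribute__((noinline)) static void example_lock_slow(example_lock_t *l);
 *
 *      static inline void
 *      example_lock(example_lock_t *l)
 *      {
 *              if (__probable(os_atomic_cmpxchg(&l->state, 0, EXAMPLE_LOCKED, acquire))) {
 *                      return;                      // uncontended: no stack frame needed
 *              }
 *              return example_lock_slow(l);         // tail call into the noinline slow path
 *      }
 */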
2121
2122 /*
2123 * Intel lock invariants:
2124 *
2125 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2126 *
2127 * The lock owner is promoted to the max priority of all its waiters only if it
2128 * was at a lower priority when it acquired the lock, or was already the owner when a waiter began waiting.
2129 * Max priority is capped at MAXPRI_PROMOTE.
2130 *
2131 * The last waiter will not be promoted as it is woken up, but the last
2132 * lock owner may not have been the last thread to have been woken up depending on the
2133 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2134 * flag set.
2135 *
2136 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2137 * priority from dropping priority in the future without having to take thread lock
2138 * on acquire.
2139 */
2140
2141 #ifdef MUTEX_ZONE
2142 extern zone_t lck_mtx_zone;
2143 #endif
2144
2145 /*
2146 * Routine: lck_mtx_alloc_init
2147 */
2148 lck_mtx_t *
2149 lck_mtx_alloc_init(
2150 lck_grp_t *grp,
2151 lck_attr_t *attr)
2152 {
2153 lck_mtx_t *lck;
2154 #ifdef MUTEX_ZONE
2155 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) {
2156 lck_mtx_init(lck, grp, attr);
2157 }
2158 #else
2159 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) {
2160 lck_mtx_init(lck, grp, attr);
2161 }
2162 #endif
2163 return lck;
2164 }
2165
2166 /*
2167 * Routine: lck_mtx_free
2168 */
2169 void
2170 lck_mtx_free(
2171 lck_mtx_t *lck,
2172 lck_grp_t *grp)
2173 {
2174 lck_mtx_destroy(lck, grp);
2175 #ifdef MUTEX_ZONE
2176 zfree(lck_mtx_zone, lck);
2177 #else
2178 kfree(lck, sizeof(lck_mtx_t));
2179 #endif
2180 }
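/*
 * Editorial sketch (not part of the original source): the usual lifecycle for a
 * dynamically allocated mutex pairs the two routines above with a lock group.
 * The group name is arbitrary and the surrounding code is illustrative only.
 *
 *      lck_grp_t *grp = lck_grp_alloc_init("com.example.driver", LCK_GRP_ATTR_NULL);
 *      lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);
 *
 *      lck_mtx_lock(mtx);
 *      // ... critical section ...
 *      lck_mtx_unlock(mtx);
 *
 *      lck_mtx_free(mtx, grp);                 // destroys the lock and frees its storage
 *      lck_grp_free(grp);
 */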
2181
2182 /*
2183 * Routine: lck_mtx_ext_init
2184 */
2185 static void
2186 lck_mtx_ext_init(
2187 lck_mtx_ext_t *lck,
2188 lck_grp_t *grp,
2189 lck_attr_t *attr)
2190 {
2191 bzero((void *)lck, sizeof(lck_mtx_ext_t));
2192
2193 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2194 lck->lck_mtx_deb.type = MUTEX_TAG;
2195 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2196 }
2197
2198 lck->lck_mtx_grp = grp;
2199
2200 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2201 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2202 }
2203
2204 lck->lck_mtx.lck_mtx_is_ext = 1;
2205 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2206 }
2207
2208 /*
2209 * Routine: lck_mtx_init
2210 */
2211 void
2212 lck_mtx_init(
2213 lck_mtx_t *lck,
2214 lck_grp_t *grp,
2215 lck_attr_t *attr)
2216 {
2217 lck_mtx_ext_t *lck_ext;
2218 lck_attr_t *lck_attr;
2219
2220 if (attr != LCK_ATTR_NULL) {
2221 lck_attr = attr;
2222 } else {
2223 lck_attr = &LockDefaultLckAttr;
2224 }
2225
2226 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2227 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2228 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2229 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2230 lck->lck_mtx_ptr = lck_ext;
2231 }
2232 } else {
2233 lck->lck_mtx_owner = 0;
2234 lck->lck_mtx_state = 0;
2235 }
2236 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2237 lck_grp_reference(grp);
2238 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2239 }
2240
2241 /*
2242 * Routine: lck_mtx_init_ext
2243 */
2244 void
2245 lck_mtx_init_ext(
2246 lck_mtx_t *lck,
2247 lck_mtx_ext_t *lck_ext,
2248 lck_grp_t *grp,
2249 lck_attr_t *attr)
2250 {
2251 lck_attr_t *lck_attr;
2252
2253 if (attr != LCK_ATTR_NULL) {
2254 lck_attr = attr;
2255 } else {
2256 lck_attr = &LockDefaultLckAttr;
2257 }
2258
2259 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2260 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2261 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2262 lck->lck_mtx_ptr = lck_ext;
2263 } else {
2264 lck->lck_mtx_owner = 0;
2265 lck->lck_mtx_state = 0;
2266 }
2267 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2268
2269 lck_grp_reference(grp);
2270 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2271 }
2272
2273 static void
2274 lck_mtx_lock_mark_destroyed(
2275 lck_mtx_t *mutex,
2276 boolean_t indirect)
2277 {
2278 uint32_t state;
2279
2280 if (indirect) {
2281 /* convert to destroyed state */
2282 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2283 return;
2284 }
2285
2286 state = ordered_load_mtx_state(mutex);
2287 lck_mtx_interlock_lock(mutex, &state);
2288
2289 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2290
2291 enable_preemption();
2292 }
2293
2294 /*
2295 * Routine: lck_mtx_destroy
2296 */
2297 void
2298 lck_mtx_destroy(
2299 lck_mtx_t *lck,
2300 lck_grp_t *grp)
2301 {
2302 boolean_t indirect;
2303
2304 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2305 return;
2306 }
2307 #if MACH_LDEBUG
2308 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2309 #endif
2310 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2311
2312 lck_mtx_lock_mark_destroyed(lck, indirect);
2313
2314 if (indirect) {
2315 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2316 }
2317 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2318 lck_grp_deallocate(grp);
2319 return;
2320 }
2321
2322
2323 #if DEVELOPMENT | DEBUG
2324 __attribute__((noinline))
2325 void
2326 lck_mtx_owner_check_panic(
2327 lck_mtx_t *lock)
2328 {
2329 thread_t owner = (thread_t)lock->lck_mtx_owner;
2330 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2331 }
2332 #endif
2333
2334 __attribute__((always_inline))
2335 static boolean_t
2336 get_indirect_mutex(
2337 lck_mtx_t **lock,
2338 uint32_t *state)
2339 {
2340 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2341 *state = ordered_load_mtx_state(*lock);
2342 return TRUE;
2343 }
2344
2345 /*
2346 * Routine: lck_mtx_unlock_slow
2347 *
2348 * Unlocks a mutex held by current thread.
2349 *
2350 * It will wake up waiters if necessary.
2351 *
2352 * Interlock can be held.
2353 */
2354 __attribute__((noinline))
2355 void
2356 lck_mtx_unlock_slow(
2357 lck_mtx_t *lock)
2358 {
2359 thread_t thread;
2360 uint32_t state, prev;
2361 boolean_t indirect = FALSE;
2362
2363 state = ordered_load_mtx_state(lock);
2364
2365 /* Is this an indirect mutex? */
2366 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2367 indirect = get_indirect_mutex(&lock, &state);
2368 }
2369
2370 thread = current_thread();
2371
2372 #if DEVELOPMENT | DEBUG
2373 thread_t owner = (thread_t)lock->lck_mtx_owner;
2374 if (__improbable(owner != thread)) {
2375 lck_mtx_owner_check_panic(lock);
2376 }
2377 #endif
2378
2379 /* check if it is held as a spinlock */
2380 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
2381 goto unlock;
2382 }
2383
2384 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2385
2386 unlock:
2387 /* preemption disabled, interlock held and mutex not held */
2388
2389 /* clear owner */
2390 ordered_store_mtx_owner(lock, 0);
2391 /* keep original state in prev for later evaluation */
2392 prev = state;
2393
2394 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2395 #if MACH_LDEBUG
2396 if (thread) {
2397 thread->mutex_count--;
2398 }
2399 #endif
2400 return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
2401 }
2402
2403 /* release interlock, promotion and clear spin flag */
2404 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
2405 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
2406
2407 #if MACH_LDEBUG
2408 /* perform lock statistics after drop to prevent delay */
2409 if (thread) {
2410 thread->mutex_count--; /* lock statistic */
2411 }
2412 #endif /* MACH_LDEBUG */
2413
2414 /* re-enable preemption */
2415 lck_mtx_unlock_finish_inline(lock, FALSE);
2416
2417 return;
2418 }
2419
2420 #define LCK_MTX_LCK_WAIT_CODE 0x20
2421 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
2422 #define LCK_MTX_LCK_SPIN_CODE 0x22
2423 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2424 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
2425
2426 /*
2427 * Routine: lck_mtx_unlock_wakeup_tail
2428 *
2429 * Invoked on unlock when there is
2430 * contention, i.e. the assembly routine sees
2431 * that mutex->lck_mtx_waiters != 0
2432 *
2433 * neither the mutex nor the interlock is held
2434 *
2435 * Note that this routine might not be called if there are pending
2436 * waiters which have previously been woken up, and they didn't
2437 * end up boosting the old owner.
2438 *
2439 * assembly routine previously did the following to mutex:
2440 * (after saving the state in prior_lock_state)
2441 * decremented lck_mtx_waiters if nonzero
2442 *
2443 * This function needs to be called as a tail call
2444 * to optimize the compiled code.
2445 */
2446 __attribute__((noinline))
2447 static void
2448 lck_mtx_unlock_wakeup_tail(
2449 lck_mtx_t *mutex,
2450 uint32_t state,
2451 boolean_t indirect)
2452 {
2453 struct turnstile *ts;
2454
2455 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2456 kern_return_t did_wake;
2457
2458 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2459 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2460
2461 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2462
2463 if (mutex->lck_mtx_waiters > 1) {
2464 /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the woken-up thread */
2465 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
2466 } else {
2467 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2468 turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
2469 }
2470 assert(did_wake == KERN_SUCCESS);
2471
2472 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2473 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2474
2475 state -= LCK_MTX_WAITER;
2476 state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
2477 ordered_store_mtx_state_release(mutex, state);
2478
2479 assert(current_thread()->turnstile != NULL);
2480
2481 turnstile_cleanup();
2482
2483 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2484 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2485
2486 lck_mtx_unlock_finish_inline(mutex, indirect);
2487 }
2488
2489 /*
2490 * Routine: lck_mtx_lock_acquire_x86
2491 *
2492 * Invoked on acquiring the mutex when there is
2493 * contention (i.e. the assembly routine sees
2494 * that mutex->lck_mtx_waiters != 0)
2495 *
2496 * mutex is owned... interlock is held... preemption is disabled
2497 */
2498 __attribute__((always_inline))
2499 static void
2500 lck_mtx_lock_acquire_inline(
2501 lck_mtx_t *mutex,
2502 struct turnstile *ts)
2503 {
2504 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2505
2506 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2507 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2508
2509 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
2510 assert(thread->waiting_for_mutex == NULL);
2511
2512 if (mutex->lck_mtx_waiters > 0) {
2513 if (ts == NULL) {
2514 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2515 }
2516
2517 turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
2518 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2519 }
2520
2521 if (ts != NULL) {
2522 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2523 }
2524
2525 assert(current_thread()->turnstile != NULL);
2526
2527 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2528 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2529 }
2530
2531 void
2532 lck_mtx_lock_acquire_x86(
2533 lck_mtx_t *mutex)
2534 {
2535 return lck_mtx_lock_acquire_inline(mutex, NULL);
2536 }
2537
2538 /*
2539 * Tail call helpers for lock functions that perform
2540 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2541 * the caller's compiled code.
2542 */
2543
2544 __attribute__((noinline))
2545 static void
2546 lck_mtx_lock_acquire_tail(
2547 lck_mtx_t *mutex,
2548 boolean_t indirect,
2549 struct turnstile *ts)
2550 {
2551 lck_mtx_lock_acquire_inline(mutex, ts);
2552 lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
2553 }
2554
2555 __attribute__((noinline))
2556 static boolean_t
2557 lck_mtx_try_lock_acquire_tail(
2558 lck_mtx_t *mutex)
2559 {
2560 lck_mtx_lock_acquire_inline(mutex, NULL);
2561 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2562
2563 return TRUE;
2564 }
2565
2566 __attribute__((noinline))
2567 static void
2568 lck_mtx_convert_spin_acquire_tail(
2569 lck_mtx_t *mutex)
2570 {
2571 lck_mtx_lock_acquire_inline(mutex, NULL);
2572 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2573 }
2574
2575 boolean_t
2576 lck_mtx_ilk_unlock(
2577 lck_mtx_t *mutex)
2578 {
2579 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2580 return TRUE;
2581 }
2582
2583 static inline void
2584 lck_mtx_interlock_lock_set_and_clear_flags(
2585 lck_mtx_t *mutex,
2586 uint32_t xor_flags,
2587 uint32_t and_flags,
2588 uint32_t *new_state)
2589 {
2590 uint32_t state, prev;
2591 state = *new_state;
2592
2593 for (;;) {
2594 /* have to wait for interlock to clear */
2595 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2596 cpu_pause();
2597 state = ordered_load_mtx_state(mutex);
2598 }
2599 prev = state; /* prev contains snapshot for exchange */
2600 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
2601 state &= ~and_flags; /* clear flags */
2602
2603 disable_preemption();
2604 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
2605 break;
2606 }
2607 enable_preemption();
2608 cpu_pause();
2609 state = ordered_load_mtx_state(mutex);
2610 }
2611 *new_state = state;
2612 return;
2613 }
2614
2615 static inline void
2616 lck_mtx_interlock_lock_clear_flags(
2617 lck_mtx_t *mutex,
2618 uint32_t and_flags,
2619 uint32_t *new_state)
2620 {
2621 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2622 }
2623
2624 static inline void
2625 lck_mtx_interlock_lock(
2626 lck_mtx_t *mutex,
2627 uint32_t *new_state)
2628 {
2629 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2630 }
2631
2632 static inline int
2633 lck_mtx_interlock_try_lock_set_flags(
2634 lck_mtx_t *mutex,
2635 uint32_t or_flags,
2636 uint32_t *new_state)
2637 {
2638 uint32_t state, prev;
2639 state = *new_state;
2640
2641 /* fail if the interlock or any of the or_flags bits is already set */
2642 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2643 return 0;
2644 }
2645 prev = state; /* prev contains snapshot for exchange */
2646 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
2647 disable_preemption();
2648 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
2649 *new_state = state;
2650 return 1;
2651 }
2652
2653 enable_preemption();
2654 return 0;
2655 }
2656
2657 __attribute__((noinline))
2658 static void
2659 lck_mtx_lock_contended(
2660 lck_mtx_t *lock,
2661 boolean_t indirect,
2662 boolean_t *first_miss)
2663 {
2664 lck_mtx_spinwait_ret_type_t ret;
2665 uint32_t state;
2666 thread_t thread;
2667 struct turnstile *ts = NULL;
2668
2669 try_again:
2670
2671 if (indirect) {
2672 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2673 }
2674
2675 ret = lck_mtx_lock_spinwait_x86(lock);
2676 state = ordered_load_mtx_state(lock);
2677 switch (ret) {
2678 case LCK_MTX_SPINWAIT_NO_SPIN:
2679 /*
2680 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2681 * try to spin.
2682 */
2683 if (indirect) {
2684 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2685 }
2686
2687 /* just fall through to the LCK_MTX_SPINWAIT_SPUN_* cases */
2688 case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
2689 case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
2690 case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
2691 case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
2692 /*
2693 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2694 * interlock not held
2695 */
2696 lck_mtx_interlock_lock(lock, &state);
2697 assert(state & LCK_MTX_ILOCKED_MSK);
2698
2699 if (state & LCK_MTX_MLOCKED_MSK) {
2700 if (indirect) {
2701 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2702 }
2703 lck_mtx_lock_wait_x86(lock, &ts);
2704 /*
2705 * interlock is not held here.
2706 */
2707 goto try_again;
2708 } else {
2709 /* grab the mutex */
2710 state |= LCK_MTX_MLOCKED_MSK;
2711 ordered_store_mtx_state_release(lock, state);
2712 thread = current_thread();
2713 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2714 #if MACH_LDEBUG
2715 if (thread) {
2716 thread->mutex_count++;
2717 }
2718 #endif /* MACH_LDEBUG */
2719 }
2720
2721 break;
2722 case LCK_MTX_SPINWAIT_ACQUIRED:
2723 /*
2724 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2725 * interlock is held and preemption disabled
2726 * owner is set and mutex marked as locked
2727 * statistics updated too
2728 */
2729 break;
2730 default:
2731 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2732 }
2733
2734 /*
2735 * interlock is already acquired here
2736 */
2737
2738 /* mutex has been acquired */
2739 thread = (thread_t)lock->lck_mtx_owner;
2740 if (state & LCK_MTX_WAITERS_MSK) {
2741 /*
2742 * lck_mtx_lock_acquire_tail will call
2743 * turnstile_complete.
2744 */
2745 return lck_mtx_lock_acquire_tail(lock, indirect, ts);
2746 }
2747
2748 if (ts != NULL) {
2749 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2750 }
2751
2752 assert(current_thread()->turnstile != NULL);
2753
2754 /* release the interlock */
2755 lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
2756 }
2757
2758 /*
2759 * Helper noinline functions that call panic,
2760 * kept out of line to optimize the compiled code.
2761 */
2762
2763 __attribute__((noinline)) __abortlike
2764 static void
2765 lck_mtx_destroyed(
2766 lck_mtx_t *lock)
2767 {
2768 panic("trying to interlock destroyed mutex (%p)", lock);
2769 }
2770
2771 __attribute__((noinline))
2772 static boolean_t
2773 lck_mtx_try_destroyed(
2774 lck_mtx_t *lock)
2775 {
2776 panic("trying to interlock destroyed mutex (%p)", lock);
2777 return FALSE;
2778 }
2779
2780 __attribute__((always_inline))
2781 static boolean_t
2782 lck_mtx_lock_wait_interlock_to_clear(
2783 lck_mtx_t *lock,
2784 uint32_t* new_state)
2785 {
2786 uint32_t state;
2787
2788 for (;;) {
2789 cpu_pause();
2790 state = ordered_load_mtx_state(lock);
2791 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2792 *new_state = state;
2793 return TRUE;
2794 }
2795 if (state & LCK_MTX_MLOCKED_MSK) {
2796 /* if it is held as mutex, just fail */
2797 return FALSE;
2798 }
2799 }
2800 }
2801
2802 __attribute__((always_inline))
2803 static boolean_t
2804 lck_mtx_try_lock_wait_interlock_to_clear(
2805 lck_mtx_t *lock,
2806 uint32_t* new_state)
2807 {
2808 uint32_t state;
2809
2810 for (;;) {
2811 cpu_pause();
2812 state = ordered_load_mtx_state(lock);
2813 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2814 /* if it is held as mutex or spin, just fail */
2815 return FALSE;
2816 }
2817 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2818 *new_state = state;
2819 return TRUE;
2820 }
2821 }
2822 }
2823
2824 /*
2825 * Routine: lck_mtx_lock_slow
2826 *
2827 * Locks a mutex for current thread.
2828 * If the lock is contended this function might
2829 * sleep.
2830 *
2831 * Called with interlock not held.
2832 */
2833 __attribute__((noinline))
2834 void
2835 lck_mtx_lock_slow(
2836 lck_mtx_t *lock)
2837 {
2838 boolean_t indirect = FALSE;
2839 uint32_t state;
2840 int first_miss = 0;
2841
2842 state = ordered_load_mtx_state(lock);
2843
2844 /* is the interlock or mutex held */
2845 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2846 /*
2847 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2848 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2849 * set in state (state == lck_mtx_tag)
2850 */
2851
2852
2853 /* is the mutex already held and not indirect */
2854 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
2855 /* no, must have been the mutex */
2856 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2857 }
2858
2859 /* check to see if it is marked destroyed */
2860 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2861 lck_mtx_destroyed(lock);
2862 }
2863
2864 /* Is this an indirect mutex? */
2865 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2866 indirect = get_indirect_mutex(&lock, &state);
2867
2868 first_miss = 0;
2869 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2870
2871 if (state & LCK_MTX_SPIN_MSK) {
2872 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2873 assert(state & LCK_MTX_ILOCKED_MSK);
2874 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2875 }
2876 }
2877
2878 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2879 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2880 }
2881 }
2882
2883 /* no - can't be INDIRECT, DESTROYED or locked */
2884 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2885 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2886 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2887 }
2888 }
2889
2890 /* lock and interlock acquired */
2891
2892 thread_t thread = current_thread();
2893 /* record owner of mutex */
2894 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2895
2896 #if MACH_LDEBUG
2897 if (thread) {
2898 thread->mutex_count++; /* lock statistic */
2899 }
2900 #endif
2901 /*
2902 * Check if there are waiters whose
2903 * priority the owner needs to inherit.
2904 */
2905 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2906 return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
2907 }
2908
2909 /* release the interlock */
2910 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2911
2912 return;
2913 }
2914
2915 __attribute__((noinline))
2916 boolean_t
2917 lck_mtx_try_lock_slow(
2918 lck_mtx_t *lock)
2919 {
2920 boolean_t indirect = FALSE;
2921 uint32_t state;
2922 int first_miss = 0;
2923
2924 state = ordered_load_mtx_state(lock);
2925
2926 /* is the interlock or mutex held */
2927 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2928 /*
2929 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2930 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2931 * set in state (state == lck_mtx_tag)
2932 */
2933
2934 /* is the mutex already held and not indirect */
2935 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
2936 return FALSE;
2937 }
2938
2939 /* check to see if it is marked destroyed */
2940 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2941 lck_mtx_try_destroyed(lock);
2942 }
2943
2944 /* Is this an indirect mutex? */
2945 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2946 indirect = get_indirect_mutex(&lock, &state);
2947
2948 first_miss = 0;
2949 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2950 }
2951
2952 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
2953 if (indirect) {
2954 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2955 }
2956 return FALSE;
2957 }
2958 }
2959
2960 /* no - can't be INDIRECT, DESTROYED or locked */
2961 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2962 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
2963 if (indirect) {
2964 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2965 }
2966 return FALSE;
2967 }
2968 }
2969
2970 /* lock and interlock acquired */
2971
2972 thread_t thread = current_thread();
2973 /* record owner of mutex */
2974 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2975
2976 #if MACH_LDEBUG
2977 if (thread) {
2978 thread->mutex_count++; /* lock statistic */
2979 }
2980 #endif
2981 /*
2982 * Check if there are waiters whose
2983 * priority the owner needs to inherit.
2984 */
2985 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2986 return lck_mtx_try_lock_acquire_tail(lock);
2987 }
2988
2989 /* release the interlock */
2990 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
2991
2992 return TRUE;
2993 }
2994
2995 __attribute__((noinline))
2996 void
2997 lck_mtx_lock_spin_slow(
2998 lck_mtx_t *lock)
2999 {
3000 boolean_t indirect = FALSE;
3001 uint32_t state;
3002 int first_miss = 0;
3003
3004 state = ordered_load_mtx_state(lock);
3005
3006 /* is the interlock or mutex held */
3007 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3008 /*
3009 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3010 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3011 * set in state (state == lck_mtx_tag)
3012 */
3013
3014
3015 /* is the mutex already held and not indirect */
3016 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3017 /* no, must have been the mutex */
3018 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3019 }
3020
3021 /* check to see if it is marked destroyed */
3022 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3023 lck_mtx_destroyed(lock);
3024 }
3025
3026 /* Is this an indirect mutex? */
3027 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3028 indirect = get_indirect_mutex(&lock, &state);
3029
3030 first_miss = 0;
3031 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3032
3033 if (state & LCK_MTX_SPIN_MSK) {
3034 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
3035 assert(state & LCK_MTX_ILOCKED_MSK);
3036 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3037 }
3038 }
3039
3040 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3041 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3042 }
3043 }
3044
3045 /* no - can't be INDIRECT, DESTROYED or locked */
3046 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3047 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3048 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3049 }
3050 }
3051
3052 /* lock as spinlock and interlock acquired */
3053
3054 thread_t thread = current_thread();
3055 /* record owner of mutex */
3056 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3057
3058 #if MACH_LDEBUG
3059 if (thread) {
3060 thread->mutex_count++; /* lock statistic */
3061 }
3062 #endif
3063
3064 #if CONFIG_DTRACE
3065 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3066 #endif
3067 /* return with the interlock held and preemption disabled */
3068 return;
3069 }
3070
3071 __attribute__((noinline))
3072 boolean_t
3073 lck_mtx_try_lock_spin_slow(
3074 lck_mtx_t *lock)
3075 {
3076 boolean_t indirect = FALSE;
3077 uint32_t state;
3078 int first_miss = 0;
3079
3080 state = ordered_load_mtx_state(lock);
3081
3082 /* is the interlock or mutex held */
3083 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3084 /*
3085 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3086 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3087 * set in state (state == lck_mtx_tag)
3088 */
3089
3090 /* is the mutex already held and not indirect */
3091 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3092 return FALSE;
3093 }
3094
3095 /* check to see if it is marked destroyed */
3096 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3097 lck_mtx_try_destroyed(lock);
3098 }
3099
3100 /* Is this an indirect mutex? */
3101 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3102 indirect = get_indirect_mutex(&lock, &state);
3103
3104 first_miss = 0;
3105 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3106 }
3107
3108 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3109 if (indirect) {
3110 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3111 }
3112 return FALSE;
3113 }
3114 }
3115
3116 /* no - can't be INDIRECT, DESTROYED or locked */
3117 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3118 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3119 if (indirect) {
3120 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3121 }
3122 return FALSE;
3123 }
3124 }
3125
3126 /* lock and interlock acquired */
3127
3128 thread_t thread = current_thread();
3129 /* record owner of mutex */
3130 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3131
3132 #if MACH_LDEBUG
3133 if (thread) {
3134 thread->mutex_count++; /* lock statistic */
3135 }
3136 #endif
3137
3138 #if CONFIG_DTRACE
3139 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3140 #endif
3141 return TRUE;
3142 }
3143
3144 __attribute__((noinline))
3145 void
3146 lck_mtx_convert_spin(
3147 lck_mtx_t *lock)
3148 {
3149 uint32_t state;
3150
3151 state = ordered_load_mtx_state(lock);
3152
3153 /* Is this an indirect mutex? */
3154 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3155 /* If so, take indirection */
3156 get_indirect_mutex(&lock, &state);
3157 }
3158
3159 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3160
3161 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3162 /* already owned as a mutex, just return */
3163 return;
3164 }
3165
3166 assert(get_preemption_level() > 0);
3167 assert(state & LCK_MTX_ILOCKED_MSK);
3168 assert(state & LCK_MTX_SPIN_MSK);
3169
3170 /*
3171 * Check if there are waiters whose
3172 * priority the owner needs to inherit.
3173 */
3174 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3175 return lck_mtx_convert_spin_acquire_tail(lock);
3176 }
3177
3178 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3179
3180 return;
3181 }
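/*
 * Editorial sketch (not part of the original source): a caller that takes the
 * spin variant for a short critical section can convert in place once it
 * discovers it must do something that may block. may_block() and
 * do_blocking_work() are hypothetical helpers.
 *
 *      lck_mtx_lock_spin(mtx);                 // interlock held, preemption disabled
 *      if (may_block(obj)) {
 *              lck_mtx_convert_spin(mtx);      // become a full mutex hold; blocking is now legal
 *              do_blocking_work(obj);
 *      }
 *      lck_mtx_unlock(mtx);                    // correct for either form of the hold
 */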
3182
3183 static inline boolean_t
3184 lck_mtx_lock_grab_mutex(
3185 lck_mtx_t *lock)
3186 {
3187 uint32_t state;
3188
3189 state = ordered_load_mtx_state(lock);
3190
3191 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3192 return FALSE;
3193 }
3194
3195 /* lock and interlock acquired */
3196
3197 thread_t thread = current_thread();
3198 /* record owner of mutex */
3199 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3200
3201 #if MACH_LDEBUG
3202 if (thread) {
3203 thread->mutex_count++; /* lock statistic */
3204 }
3205 #endif
3206 return TRUE;
3207 }
3208
3209 __attribute__((noinline))
3210 void
3211 lck_mtx_assert(
3212 lck_mtx_t *lock,
3213 unsigned int type)
3214 {
3215 thread_t thread, owner;
3216 uint32_t state;
3217
3218 thread = current_thread();
3219 state = ordered_load_mtx_state(lock);
3220
3221 if (state == LCK_MTX_TAG_INDIRECT) {
3222 get_indirect_mutex(&lock, &state);
3223 }
3224
3225 owner = (thread_t)lock->lck_mtx_owner;
3226
3227 if (type == LCK_MTX_ASSERT_OWNED) {
3228 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
3229 panic("mutex (%p) not owned\n", lock);
3230 }
3231 } else {
3232 assert(type == LCK_MTX_ASSERT_NOTOWNED);
3233 if (owner == thread) {
3234 panic("mutex (%p) owned\n", lock);
3235 }
3236 }
3237 }
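/*
 * Editorial sketch (not part of the original source): like the rw-lock variant,
 * lck_mtx_assert() is typically used to document a helper's locking contract.
 * my_queue_t, my_elem_t and the routine below are hypothetical.
 *
 *      static void
 *      queue_push_locked(my_queue_t *q, my_elem_t *e)
 *      {
 *              lck_mtx_assert(&q->mtx, LCK_MTX_ASSERT_OWNED);  // caller holds q->mtx
 *              enqueue_tail(&q->head, &e->link);
 *      }
 */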
3238
3239 /*
3240 * Routine: lck_mtx_lock_spinwait_x86
3241 *
3242 * Invoked trying to acquire a mutex when there is contention but
3243 * the holder is running on another processor. We spin for up to a maximum
3244 * time waiting for the lock to be released.
3245 *
3246 * Called with the interlock unlocked.
3247 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3248 * returns LCK_MTX_SPINWAIT_SPUN_* if we spun
3249 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
3250 */
3251 __attribute__((noinline))
3252 lck_mtx_spinwait_ret_type_t
3253 lck_mtx_lock_spinwait_x86(
3254 lck_mtx_t *mutex)
3255 {
3256 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3257 thread_t owner, prev_owner;
3258 uint64_t window_deadline, sliding_deadline, high_deadline;
3259 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
3260 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3261 int loopcount = 0;
3262 int total_hold_time_samples, window_hold_time_samples, unfairness;
3263 uint i, prev_owner_cpu;
3264 bool owner_on_core, adjust;
3265
3266 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3267 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3268
3269 start_time = mach_absolute_time();
3270 /*
3271 * window_deadline represents the "learning" phase.
3272 * The thread collects statistics about the lock until
3273 * window_deadline and then decides whether to keep spinning
3274 * or to block, according to the concurrency behavior
3275 * observed.
3276 *
3277 * Every thread can spin at least low_MutexSpin.
3278 */
3279 window_deadline = start_time + low_MutexSpin;
3280 /*
3281 * Sliding_deadline is the adjusted spin deadline
3282 * computed after the "learning" phase.
3283 */
3284 sliding_deadline = window_deadline;
3285 /*
3286 * High_deadline is a hard deadline. No thread
3287 * can spin past this deadline.
3288 */
3289 if (high_MutexSpin >= 0) {
3290 high_deadline = start_time + high_MutexSpin;
3291 } else {
3292 high_deadline = start_time + low_MutexSpin * real_ncpus;
3293 }
3294
3295 /*
3296 * We do not know yet which cpu the owner is running on.
3297 * Initialize prev_owner_cpu with the next cpu.
3298 */
3299 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
3300 total_hold_time_samples = 0;
3301 window_hold_time_samples = 0;
3302 avg_hold_time = 0;
3303 adjust = TRUE;
3304 bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;
3305
3306 prev_owner = (thread_t) mutex->lck_mtx_owner;
3307 /*
3308 * Spin while:
3309 * - mutex is locked, and
3310 * - it's locked as a spin lock, and
3311 * - owner is running on another processor, and
3312 * - we haven't spun for long enough.
3313 */
3314 do {
3315 /*
3316 * Try to acquire the lock.
3317 */
3318 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3319 retval = LCK_MTX_SPINWAIT_ACQUIRED;
3320 break;
3321 }
3322
3323 cur_time = mach_absolute_time();
3324
3325 /*
3326 * Never spin past high_deadline.
3327 */
3328 if (cur_time >= high_deadline) {
3329 retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3330 break;
3331 }
3332
3333 /*
3334 * Check if the owner is on core. If not, block.
3335 */
3336 owner = (thread_t) mutex->lck_mtx_owner;
3337 if (owner) {
3338 i = prev_owner_cpu;
3339 owner_on_core = FALSE;
3340
3341 disable_preemption();
3342 owner = (thread_t) mutex->lck_mtx_owner;
3343
3344 /*
3345 * For scalability we want to check if the owner is on core
3346 * without locking the mutex interlock.
3347 * If we do not lock the mutex interlock, the owner that we see might be
3348 * invalid, so we cannot dereference it. Therefore we cannot check
3349 * any field of the thread to tell us if it is on core.
3350 * Instead, check whether the thread running on any other cpu matches the owner.
3351 */
3352 if (owner) {
3353 do {
3354 if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner)) {
3355 owner_on_core = TRUE;
3356 break;
3357 }
3358 if (++i >= real_ncpus) {
3359 i = 0;
3360 }
3361 } while (i != prev_owner_cpu);
3362 enable_preemption();
3363
3364 if (owner_on_core) {
3365 prev_owner_cpu = i;
3366 } else {
3367 prev_owner = owner;
3368 owner = (thread_t) mutex->lck_mtx_owner;
3369 if (owner == prev_owner) {
3370 /*
3371 * Owner is not on core.
3372 * Stop spinning.
3373 */
3374 if (loopcount == 0) {
3375 retval = LCK_MTX_SPINWAIT_NO_SPIN;
3376 } else {
3377 retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
3378 }
3379 break;
3380 }
3381 /*
3382 * Fall through if the owner changed while we were scanning.
3383 * The new owner could potentially be on core, so loop
3384 * again.
3385 */
3386 }
3387 } else {
3388 enable_preemption();
3389 }
3390 }
3391
3392 /*
3393 * Save how many times we see the owner changing.
3394 * We can roughly estimate the mutex hold
3395 * time and the fairness with that.
3396 */
3397 if (owner != prev_owner) {
3398 prev_owner = owner;
3399 total_hold_time_samples++;
3400 window_hold_time_samples++;
3401 }
3402
3403 /*
3404 * Learning window expired.
3405 * Try to adjust the sliding_deadline.
3406 */
3407 if (cur_time >= window_deadline) {
3408 /*
3409 * If there was no contention during the window,
3410 * stop spinning.
3411 */
3412 if (window_hold_time_samples < 1) {
3413 retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
3414 break;
3415 }
3416
3417 if (adjust) {
3418 /*
3419 * For a fair lock, we'd wait for at most (NCPU-1) periods,
3420 * but the lock is unfair, so let's try to estimate by how much.
3421 */
3422 unfairness = total_hold_time_samples / real_ncpus;
3423
3424 if (unfairness == 0) {
3425 /*
3426 * We observed the owner changing `total_hold_time_samples` times which
3427 * let us estimate the average hold time of this mutex for the duration
3428 * of the spin time.
3429 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
3430 *
3431 * In this case spin at max avg_hold_time * (real_ncpus - 1)
3432 */
3433 delta = cur_time - start_time;
3434 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
3435 } else {
3436 /*
3437 * In this case at least one of the other cpus was able to get the lock twice
3438 * while I was spinning.
3439 * We could spin longer but it won't necessarily help if the system is unfair.
3440 * Try to randomize the wait to reduce contention.
3441 *
3442 * We compute how much time we could potentially spin
3443 * and distribute it over the cpus.
3444 *
3445 * bias is an integer between 0 and real_ncpus.
3446 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
3447 */
3448 delta = high_deadline - cur_time;
3449 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
3450 adjust = FALSE;
3451 }
3452 }
3453
3454 window_deadline += low_MutexSpin;
3455 window_hold_time_samples = 0;
3456 }
3457
3458 /*
3459 * Stop spinning if we are past
3460 * the adjusted deadline.
3461 */
3462 if (cur_time >= sliding_deadline) {
3463 retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
3464 break;
3465 }
3466
3467 if ((thread_t) mutex->lck_mtx_owner != NULL) {
3468 cpu_pause();
3469 }
3470
3471 loopcount++;
3472 } while (TRUE);
3473
3474 #if CONFIG_DTRACE
3475 /*
3476 * Note that we record a different probe id depending on whether
3477 * this is a direct or indirect mutex. This allows us to
3478 * penalize only lock groups that have debug/stats enabled
3479 * with dtrace processing if desired.
3480 */
3481 if (__probable(mutex->lck_mtx_is_ext == 0)) {
3482 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3483 mach_absolute_time() - start_time);
3484 } else {
3485 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3486 mach_absolute_time() - start_time);
3487 }
3488 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3489 #endif
3490
3491 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3492 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3493
3494 return retval;
3495 }
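/*
 * Editorial note (not part of the original source): a worked example of the
 * sliding_deadline adjustment above, with made-up numbers. Suppose real_ncpus
 * is 8 and, when the learning window expires, delta = cur_time - start_time is
 * 12000 ticks with total_hold_time_samples = 3 owner changes. Then
 * unfairness = 3 / 8 = 0 (integer division), the average hold time is roughly
 * 12000 / 3 = 4000 ticks, and
 * sliding_deadline = start_time + (12000 * 7) / 3 = start_time + 28000 ticks,
 * i.e. the thread allows itself to spin for about (real_ncpus - 1) = 7 average
 * hold times before giving up and blocking.
 */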
3496
3497
3498
3499 /*
3500 * Routine: lck_mtx_lock_wait_x86
3501 *
3502 * Invoked in order to wait on contention.
3503 *
3504 * Called with the interlock locked and
3505 * preemption disabled...
3506 * returns it unlocked and with preemption enabled
3507 *
3508 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3509 * A runnable waiter can exist between wait and acquire
3510 * without a waiters count being set.
3511 * This allows us to never make a spurious wakeup call.
3512 *
3513 * Priority:
3514 * This avoids taking the thread lock if the owning thread is the same priority.
3515 * This optimizes the case of same-priority threads contending on a lock.
3516 * However, that allows the owning thread to drop in priority while holding the lock,
3517 * because there is no state that the priority change can notice that
3518 * says that the targeted thread holds a contended mutex.
3519 *
3520 * One possible solution: priority changes could look for some atomic tag
3521 * on the thread saying 'holding contended lock', and then set up a promotion.
3522 * Needs a story for dropping that promotion - the last contended unlock
3523 * has to notice that this has happened.
3524 */
3525 __attribute__((noinline))
3526 void
3527 lck_mtx_lock_wait_x86(
3528 lck_mtx_t *mutex,
3529 struct turnstile **ts)
3530 {
3531 thread_t self = current_thread();
3532
3533 #if CONFIG_DTRACE
3534 uint64_t sleep_start = 0;
3535
3536 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3537 sleep_start = mach_absolute_time();
3538 }
3539 #endif
3540 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3541
3542 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3543 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3544 mutex->lck_mtx_waiters, 0, 0);
3545
3546 assert(self->waiting_for_mutex == NULL);
3547 self->waiting_for_mutex = mutex;
3548 mutex->lck_mtx_waiters++;
3549
3550 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3551 assert(holder != NULL);
3552
3553 /*
3554 * lck_mtx_lock_wait_x86 might be called in a loop. Call turnstile_prepare just once and reuse
3555 * the same turnstile while looping; the matching turnstile_complete will be called
3556 * by lck_mtx_lock_contended when the lock is finally acquired.
3557 */
3558 if (*ts == NULL) {
3559 *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
3560 }
3561
3562 struct turnstile *turnstile = *ts;
3563 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3564 turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
3565
3566 waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
3567
3568 lck_mtx_ilk_unlock(mutex);
3569
3570 turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
3571
3572 thread_block(THREAD_CONTINUE_NULL);
3573
3574 self->waiting_for_mutex = NULL;
3575
3576 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3577 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3578 mutex->lck_mtx_waiters, 0, 0);
3579
3580 #if CONFIG_DTRACE
3581 /*
3582 * Record the DTrace lockstat probe for blocking; block time is
3583 * measured from when we were entered.
3584 */
3585 if (sleep_start) {
3586 if (mutex->lck_mtx_is_ext == 0) {
3587 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3588 mach_absolute_time() - sleep_start);
3589 } else {
3590 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3591 mach_absolute_time() - sleep_start);
3592 }
3593 }
3594 #endif
3595 }
3596
3597 /*
3598 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3599 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3600 * Returns: TRUE if lock is acquired.
3601 */
3602 boolean_t
3603 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3604 {
3605 if (not_in_kdp) {
3606 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3607 }
3608
3609 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3610 return TRUE;
3611 }
3612
3613 return FALSE;
3614 }
3615
3616 void
3617 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3618 {
3619 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3620 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3621 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3622 waitinfo->owner = thread_tid(holder);
3623 }
3624
3625 void
3626 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3627 {
3628 lck_rw_t *rwlck = NULL;
3629 switch (waitinfo->wait_type) {
3630 case kThreadWaitKernelRWLockRead:
3631 rwlck = READ_EVENT_TO_RWLOCK(event);
3632 break;
3633 case kThreadWaitKernelRWLockWrite:
3634 case kThreadWaitKernelRWLockUpgrade:
3635 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3636 break;
3637 default:
3638 panic("%s was called with an invalid blocking type", __FUNCTION__);
3639 break;
3640 }
3641 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3642 waitinfo->owner = 0;
3643 }