osfmk/i386/locks_i386.c

   1 /*
   2  * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  *      File:   kern/lock.c
  58  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  59  *      Date:   1985
  60  *
  61  *      Locking primitives implementation
  62  */
  63
  64 #define ATOMIC_PRIVATE 1
  65 #define LOCK_PRIVATE 1
  66
  67 #include <mach_ldebug.h>
  68
  69 #include <kern/locks.h>
  70 #include <kern/kalloc.h>
  71 #include <kern/misc_protos.h>
  72 #include <kern/thread.h>
  73 #include <kern/processor.h>
  74 #include <kern/cpu_data.h>
  75 #include <kern/cpu_number.h>
  76 #include <kern/sched_prim.h>
  77 #include <kern/xpr.h>
  78 #include <kern/debug.h>
  79 #include <string.h>
  80
  81 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
  82 #include <machine/atomic.h>
  83 #include <machine/machine_cpu.h>
  84 #include <i386/mp.h>
  85 #include <machine/atomic.h>
  86 #include <sys/kdebug.h>
  87 #include <i386/locks_i386_inlines.h>
  88
  89 /*
  90  * We need only enough declarations from the BSD-side to be able to
  91  * test if our probe is active, and to call __dtrace_probe().  Setting
  92  * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
  93  */
  94 #if     CONFIG_DTRACE
  95 #define NEED_DTRACE_DEFS
  96 #include <../bsd/sys/lockstat.h>
  97
  98 #define DTRACE_RW_SHARED        0x0     //reader
  99 #define DTRACE_RW_EXCL          0x1     //writer
 100 #define DTRACE_NO_FLAG          0x0     //not applicable
 101
 102 #endif
 103
/*
 * Numeric codes, one per lck_rw operation (first group) or per
 * spin/wait phase of an operation (second group).
 * NOTE(review): presumably kdebug event sub-codes -- confirm against
 * the code that consumes them.
 * The values are not contiguous: they jump from 0x109 to 0x110.
 */
#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113


/* True when any of the three lock-debugging build options is enabled. */
#define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

/* Global lock options word; starts out with no option bits set. */
unsigned int LcksOpts=0;

#if DEVELOPMENT || DEBUG
/* Exists only on DEVELOPMENT/DEBUG builds; starts out as 0. */
unsigned int LckDisablePreemptCheck = 0;
#endif
 128
/* Forwards */

#if     USLOCK_DEBUG
/*
 *      Perform simple lock checks.
 *
 *      uslock_check is consulted by USLOCK_CHECKING() and by
 *      usld_lock_init() when it picks a lock's initial debug state.
 */
int     uslock_check = 1;
/* NOTE(review): max_lock_loops has no user in the code shown here. */
int     max_lock_loops  = 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
#endif  /* USLOCK_DEBUG */

/*
 * Defined elsewhere.  kdp_lck_spin_is_acquired() panics when this is
 * non-zero, i.e. when it is called outside the kernel debugger.
 */
extern unsigned int not_in_kdp;
 142
/*
 *      We often want to know the addresses of the callers
 *      of the various lock routines.  However, this information
 *      is only used for debugging and statistics.
 *
 *      DECL_PC(pc) declares a local pc_t and OBTAIN_PC(pc) fills it
 *      with GET_RETURN_PC() when ANY_LOCK_DEBUG is set; otherwise
 *      both expand to nothing (except under lint, where OBTAIN_PC
 *      touches the variable to silence unused-variable complaints).
 */
typedef void    *pc_t;
/* Sentinels stored in the debug fields when no pc/thread is recorded. */
#define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)
#if     ANY_LOCK_DEBUG
#define OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
#define DECL_PC(pc)     pc_t pc;
#else   /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef  lint
/*
 *      Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc)   ++pc
#else   /* lint */
#define OBTAIN_PC(pc)
#endif  /* lint */
#endif  /* ANY_LOCK_DEBUG */
 165
 166 /*
 167  * atomic exchange API is a low level abstraction of the operations
 168  * to atomically read, modify, and write a pointer.  This abstraction works
 169  * for both Intel and ARMv8.1 compare and exchange atomic instructions as
 170  * well as the ARM exclusive instructions.
 171  *
 172  * atomic_exchange_begin() - begin exchange and retrieve current value
 173  * atomic_exchange_complete() - conclude an exchange
 174  * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
 175  */
 176 static uint32_t
 177 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
 178 {
 179         uint32_t        val;
 180
 181         (void)ord;                      // Memory order not used
 182         val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
 183         *previous = val;
 184         return val;
 185 }
 186
 187 static boolean_t
 188 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
 189 {
 190         return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
 191 }
 192
 193 static void
 194 atomic_exchange_abort(void) { }
 195
 196 static boolean_t
 197 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
 198 {
 199         uint32_t        value, prev;
 200
 201         for ( ; ; ) {
 202                 value = atomic_exchange_begin32(target, &prev, ord);
 203                 if (value & test_mask) {
 204                         if (wait)
 205                                 cpu_pause();
 206                         else
 207                                 atomic_exchange_abort();
 208                         return FALSE;
 209                 }
 210                 value |= set_mask;
 211                 if (atomic_exchange_complete32(target, prev, value, ord))
 212                         return TRUE;
 213         }
 214 }
 215
/*
 *      Portable lock package implementation of usimple_locks.
 */

/*
 *      USLDBG(stmt) expands to stmt when USLOCK_DEBUG is set and to
 *      nothing otherwise, so the usld_* debug hooks declared below
 *      are only called in USLOCK_DEBUG builds.
 */
#if     USLOCK_DEBUG
#define USLDBG(stmt)    stmt
void            usld_lock_init(usimple_lock_t, unsigned short);
void            usld_lock_pre(usimple_lock_t, pc_t);
void            usld_lock_post(usimple_lock_t, pc_t);
void            usld_unlock(usimple_lock_t, pc_t);
void            usld_lock_try_pre(usimple_lock_t, pc_t);
void            usld_lock_try_post(usimple_lock_t, pc_t);
int             usld_lock_common_checks(usimple_lock_t, char *);
#else   /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif  /* USLOCK_DEBUG */
 232
/*
 * Forward declarations
 *
 * Prototypes for the lck_rw and lck_mtx helpers whose definitions
 * appear further down in this file.
 */

static void lck_rw_lock_shared_gen(lck_rw_t *lck);
static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
/* Not static: this one has external linkage. */
void lck_rw_clear_promotions_x86(thread_t thread);
static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
static boolean_t lck_rw_grab_want(lck_rw_t *lock);
static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect);
static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
 254
 255
 256 /*
 257  *      Routine:        lck_spin_alloc_init
 258  */
 259 lck_spin_t *
 260 lck_spin_alloc_init(
 261         lck_grp_t       *grp,
 262         lck_attr_t      *attr)
 263 {
 264         lck_spin_t      *lck;
 265
 266         if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
 267                 lck_spin_init(lck, grp, attr);
 268
 269         return(lck);
 270 }
 271
 272 /*
 273  *      Routine:        lck_spin_free
 274  */
 275 void
 276 lck_spin_free(
 277         lck_spin_t      *lck,
 278         lck_grp_t       *grp)
 279 {
 280         lck_spin_destroy(lck, grp);
 281         kfree(lck, sizeof(lck_spin_t));
 282 }
 283
 284 /*
 285  *      Routine:        lck_spin_init
 286  */
 287 void
 288 lck_spin_init(
 289         lck_spin_t      *lck,
 290         lck_grp_t       *grp,
 291         __unused lck_attr_t     *attr)
 292 {
 293         usimple_lock_init((usimple_lock_t) lck, 0);
 294         lck_grp_reference(grp);
 295         lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
 296 }
 297
 298 /*
 299  *      Routine:        lck_spin_destroy
 300  */
 301 void
 302 lck_spin_destroy(
 303         lck_spin_t      *lck,
 304         lck_grp_t       *grp)
 305 {
 306         if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
 307                 return;
 308         lck->interlock = LCK_SPIN_TAG_DESTROYED;
 309         lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
 310         lck_grp_deallocate(grp);
 311         return;
 312 }
 313
 314 /*
 315  *      Routine:        lck_spin_lock
 316  */
 317 void
 318 lck_spin_lock(
 319         lck_spin_t      *lck)
 320 {
 321         usimple_lock((usimple_lock_t) lck);
 322 }
 323
 324 /*
 325  *      Routine:        lck_spin_unlock
 326  */
 327 void
 328 lck_spin_unlock(
 329         lck_spin_t      *lck)
 330 {
 331         usimple_unlock((usimple_lock_t) lck);
 332 }
 333
 334
 335 /*
 336  *      Routine:        lck_spin_try_lock
 337  */
 338 boolean_t
 339 lck_spin_try_lock(
 340         lck_spin_t      *lck)
 341 {
 342         boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
 343 #if     DEVELOPMENT || DEBUG
 344         if (lrval) {
 345                 pltrace(FALSE);
 346         }
 347 #endif
 348         return(lrval);
 349 }
 350
 351 /*
 352  *      Routine:        lck_spin_assert
 353  */
 354 void
 355 lck_spin_assert(lck_spin_t *lock, unsigned int type)
 356 {
 357         thread_t thread, holder;
 358         uintptr_t state;
 359
 360         if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
 361                 panic("lck_spin_assert(): invalid arg (%u)", type);
 362         }
 363
 364         state = lock->interlock;
 365         holder = (thread_t)state;
 366         thread = current_thread();
 367         if (type == LCK_ASSERT_OWNED) {
 368                 if (__improbable(holder == THREAD_NULL)) {
 369                         panic("Lock not owned %p = %lx", lock, state);
 370                 }
 371                 if (__improbable(holder != thread)) {
 372                         panic("Lock not owned by current thread %p = %lx", lock, state);
 373                 }
 374         } else if (type == LCK_ASSERT_NOTOWNED) {
 375                 if (__improbable(holder != THREAD_NULL)) {
 376                         if (holder == thread) {
 377                                 panic("Lock owned by current thread %p = %lx", lock, state);
 378                         } else {
 379                                 panic("Lock %p owned by thread %p", lock, holder);
 380                         }
 381                 }
 382         }
 383 }
 384
 385 /*
 386  *      Routine: kdp_lck_spin_is_acquired
 387  *      NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 388  *      Returns: TRUE if lock is acquired.
 389  */
 390 boolean_t
 391 kdp_lck_spin_is_acquired(lck_spin_t *lck) {
 392         if (not_in_kdp) {
 393                 panic("panic: spinlock acquired check done outside of kernel debugger");
 394         }
 395         return (lck->interlock != 0)? TRUE : FALSE;
 396 }
 397
 398 /*
 399  *      Initialize a usimple_lock.
 400  *
 401  *      No change in preemption state.
 402  */
 403 void
 404 usimple_lock_init(
 405         usimple_lock_t  l,
 406         __unused unsigned short tag)
 407 {
 408 #ifndef MACHINE_SIMPLE_LOCK
 409         USLDBG(usld_lock_init(l, tag));
 410         hw_lock_init(&l->interlock);
 411 #else
 412         simple_lock_init((simple_lock_t)l,tag);
 413 #endif
 414 }
 415
 416 volatile uint32_t spinlock_owner_cpu = ~0;
 417 volatile usimple_lock_t spinlock_timed_out;
 418
 419 uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
 420         uint32_t i;
 421
 422         for (i = 0; i < real_ncpus; i++) {
 423                 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
 424                         spinlock_owner_cpu = i;
 425                         if ((uint32_t) cpu_number() != i) {
 426                                 /* Cause NMI and panic on the owner's cpu */
 427                                 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
 428                         }
 429                         break;
 430                 }
 431         }
 432
 433         return spinlock_owner_cpu;
 434 }
 435
 436 /*
 437  *      Acquire a usimple_lock.
 438  *
 439  *      Returns with preemption disabled.  Note
 440  *      that the hw_lock routines are responsible for
 441  *      maintaining preemption state.
 442  */
 443 void
 444 usimple_lock(
 445         usimple_lock_t  l)
 446 {
 447 #ifndef MACHINE_SIMPLE_LOCK
 448         DECL_PC(pc);
 449
 450         OBTAIN_PC(pc);
 451         USLDBG(usld_lock_pre(l, pc));
 452
 453         if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0))        {
 454                 boolean_t uslock_acquired = FALSE;
 455                 while (machine_timeout_suspended()) {
 456                         enable_preemption();
 457                         if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
 458                                 break;
 459                 }
 460
 461                 if (uslock_acquired == FALSE) {
 462                         uint32_t lock_cpu;
 463                         uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
 464                         spinlock_timed_out = l;
 465                         lock_cpu = spinlock_timeout_NMI(lowner);
 466                         panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
 467                               l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
 468                 }
 469         }
 470 #if DEVELOPMENT || DEBUG
 471                 pltrace(FALSE);
 472 #endif
 473
 474         USLDBG(usld_lock_post(l, pc));
 475 #else
 476         simple_lock((simple_lock_t)l);
 477 #endif
 478 #if CONFIG_DTRACE
 479         LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0);
 480 #endif
 481 }
 482
 483
 484 /*
 485  *      Release a usimple_lock.
 486  *
 487  *      Returns with preemption enabled.  Note
 488  *      that the hw_lock routines are responsible for
 489  *      maintaining preemption state.
 490  */
 491 void
 492 usimple_unlock(
 493         usimple_lock_t  l)
 494 {
 495 #ifndef MACHINE_SIMPLE_LOCK
 496         DECL_PC(pc);
 497
 498         OBTAIN_PC(pc);
 499         USLDBG(usld_unlock(l, pc));
 500 #if DEVELOPMENT || DEBUG
 501                 pltrace(TRUE);
 502 #endif
 503         hw_lock_unlock(&l->interlock);
 504 #else
 505         simple_unlock_rwmb((simple_lock_t)l);
 506 #endif
 507 }
 508
 509
 510 /*
 511  *      Conditionally acquire a usimple_lock.
 512  *
 513  *      On success, returns with preemption disabled.
 514  *      On failure, returns with preemption in the same state
 515  *      as when first invoked.  Note that the hw_lock routines
 516  *      are responsible for maintaining preemption state.
 517  *
 518  *      XXX No stats are gathered on a miss; I preserved this
 519  *      behavior from the original assembly-language code, but
 520  *      doesn't it make sense to log misses?  XXX
 521  */
 522 unsigned int
 523 usimple_lock_try(
 524         usimple_lock_t  l)
 525 {
 526 #ifndef MACHINE_SIMPLE_LOCK
 527         unsigned int    success;
 528         DECL_PC(pc);
 529
 530         OBTAIN_PC(pc);
 531         USLDBG(usld_lock_try_pre(l, pc));
 532         if ((success = hw_lock_try(&l->interlock))) {
 533 #if DEVELOPMENT || DEBUG
 534                 pltrace(FALSE);
 535 #endif
 536         USLDBG(usld_lock_try_post(l, pc));
 537         }
 538         return success;
 539 #else
 540         return(simple_lock_try((simple_lock_t)l));
 541 #endif
 542 }
 543
 544 /*
 545  * Acquire a usimple_lock while polling for pending TLB flushes
 546  * and spinning on a lock.
 547  *
 548  */
 549 void
 550 usimple_lock_try_lock_loop(usimple_lock_t l)
 551 {
 552         boolean_t istate = ml_get_interrupts_enabled();
 553         while (!simple_lock_try((l))) {
 554                 if (!istate)
 555                         handle_pending_TLB_flushes();
 556                 cpu_pause();
 557         }
 558 }
 559
 560 #if     USLOCK_DEBUG
/*
 *      States of a usimple_lock.  The default when initializing
 *      a usimple_lock is setting it up for debug checking.
 *
 *      These are bit flags kept in l->debug.state.
 *      USLOCK_CHECKING(l) is true only when the global uslock_check
 *      is non-zero and the lock itself has USLOCK_CHECKED set.
 */
#define USLOCK_CHECKED          0x0001          /* lock is being checked */
#define USLOCK_TAKEN            0x0002          /* lock has been taken */
#define USLOCK_INIT             0xBAA0          /* lock has been initialized */
#define USLOCK_INITIALIZED      (USLOCK_INIT|USLOCK_CHECKED)
#define USLOCK_CHECKING(l)      (uslock_check &&                        \
                                 ((l)->debug.state & USLOCK_CHECKED))

/*
 *      Trace activities of a particularly interesting lock.
 */
void    usl_trace(usimple_lock_t, int, pc_t, const char *);
 576
 577
 578 /*
 579  *      Initialize the debugging information contained
 580  *      in a usimple_lock.
 581  */
 582 void
 583 usld_lock_init(
 584         usimple_lock_t  l,
 585         __unused unsigned short tag)
 586 {
 587         if (l == USIMPLE_LOCK_NULL)
 588                 panic("lock initialization:  null lock pointer");
 589         l->lock_type = USLOCK_TAG;
 590         l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
 591         l->debug.lock_cpu = l->debug.unlock_cpu = 0;
 592         l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
 593         l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
 594         l->debug.duration[0] = l->debug.duration[1] = 0;
 595         l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
 596         l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
 597         l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
 598 }
 599
 600
 601 /*
 602  *      These checks apply to all usimple_locks, not just
 603  *      those with USLOCK_CHECKED turned on.
 604  */
 605 int
 606 usld_lock_common_checks(
 607         usimple_lock_t  l,
 608         char            *caller)
 609 {
 610         if (l == USIMPLE_LOCK_NULL)
 611                 panic("%s:  null lock pointer", caller);
 612         if (l->lock_type != USLOCK_TAG)
 613                 panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
 614         if (!(l->debug.state & USLOCK_INIT))
 615                 panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
 616         return USLOCK_CHECKING(l);
 617 }
 618
 619
 620 /*
 621  *      Debug checks on a usimple_lock just before attempting
 622  *      to acquire it.
 623  */
 624 /* ARGSUSED */
 625 void
 626 usld_lock_pre(
 627         usimple_lock_t  l,
 628         pc_t            pc)
 629 {
 630         char    caller[] = "usimple_lock";
 631
 632
 633         if (!usld_lock_common_checks(l, caller))
 634                 return;
 635
 636 /*
 637  *      Note that we have a weird case where we are getting a lock when we are]
 638  *      in the process of putting the system to sleep. We are running with no
 639  *      current threads, therefore we can't tell if we are trying to retake a lock
 640  *      we have or someone on the other processor has it.  Therefore we just
 641  *      ignore this test if the locking thread is 0.
 642  */
 643
 644         if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
 645             l->debug.lock_thread == (void *) current_thread()) {
 646                 printf("%s:  lock %p already locked (at %p) by",
 647                       caller, l, l->debug.lock_pc);
 648                 printf(" current thread %p (new attempt at pc %p)\n",
 649                        l->debug.lock_thread, pc);
 650                 panic("%s", caller);
 651         }
 652         mp_disable_preemption();
 653         usl_trace(l, cpu_number(), pc, caller);
 654         mp_enable_preemption();
 655 }
 656
 657
 658 /*
 659  *      Debug checks on a usimple_lock just after acquiring it.
 660  *
 661  *      Pre-emption has been disabled at this point,
 662  *      so we are safe in using cpu_number.
 663  */
 664 void
 665 usld_lock_post(
 666         usimple_lock_t  l,
 667         pc_t            pc)
 668 {
 669         int     mycpu;
 670         char    caller[] = "successful usimple_lock";
 671
 672
 673         if (!usld_lock_common_checks(l, caller))
 674                 return;
 675
 676         if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
 677                 panic("%s:  lock %p became uninitialized",
 678                       caller, l);
 679         if ((l->debug.state & USLOCK_TAKEN))
 680                 panic("%s:  lock 0x%p became TAKEN by someone else",
 681                       caller, l);
 682
 683         mycpu = cpu_number();
 684         l->debug.lock_thread = (void *)current_thread();
 685         l->debug.state |= USLOCK_TAKEN;
 686         l->debug.lock_pc = pc;
 687         l->debug.lock_cpu = mycpu;
 688
 689         usl_trace(l, mycpu, pc, caller);
 690 }
 691
 692
 693 /*
 694  *      Debug checks on a usimple_lock just before
 695  *      releasing it.  Note that the caller has not
 696  *      yet released the hardware lock.
 697  *
 698  *      Preemption is still disabled, so there's
 699  *      no problem using cpu_number.
 700  */
 701 void
 702 usld_unlock(
 703         usimple_lock_t  l,
 704         pc_t            pc)
 705 {
 706         int     mycpu;
 707         char    caller[] = "usimple_unlock";
 708
 709
 710         if (!usld_lock_common_checks(l, caller))
 711                 return;
 712
 713         mycpu = cpu_number();
 714
 715         if (!(l->debug.state & USLOCK_TAKEN))
 716                 panic("%s:  lock 0x%p hasn't been taken",
 717                       caller, l);
 718         if (l->debug.lock_thread != (void *) current_thread())
 719                 panic("%s:  unlocking lock 0x%p, owned by thread %p",
 720                       caller, l, l->debug.lock_thread);
 721         if (l->debug.lock_cpu != mycpu) {
 722                 printf("%s:  unlocking lock 0x%p on cpu 0x%x",
 723                        caller, l, mycpu);
 724                 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
 725                 panic("%s", caller);
 726         }
 727         usl_trace(l, mycpu, pc, caller);
 728
 729         l->debug.unlock_thread = l->debug.lock_thread;
 730         l->debug.lock_thread = INVALID_PC;
 731         l->debug.state &= ~USLOCK_TAKEN;
 732         l->debug.unlock_pc = pc;
 733         l->debug.unlock_cpu = mycpu;
 734 }
 735
 736
 737 /*
 738  *      Debug checks on a usimple_lock just before
 739  *      attempting to acquire it.
 740  *
 741  *      Preemption isn't guaranteed to be disabled.
 742  */
 743 void
 744 usld_lock_try_pre(
 745         usimple_lock_t  l,
 746         pc_t            pc)
 747 {
 748         char    caller[] = "usimple_lock_try";
 749
 750         if (!usld_lock_common_checks(l, caller))
 751                 return;
 752         mp_disable_preemption();
 753         usl_trace(l, cpu_number(), pc, caller);
 754         mp_enable_preemption();
 755 }
 756
 757
 758 /*
 759  *      Debug checks on a usimple_lock just after
 760  *      successfully attempting to acquire it.
 761  *
 762  *      Preemption has been disabled by the
 763  *      lock acquisition attempt, so it's safe
 764  *      to use cpu_number.
 765  */
 766 void
 767 usld_lock_try_post(
 768         usimple_lock_t  l,
 769         pc_t            pc)
 770 {
 771         int     mycpu;
 772         char    caller[] = "successful usimple_lock_try";
 773
 774         if (!usld_lock_common_checks(l, caller))
 775                 return;
 776
 777         if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
 778                 panic("%s:  lock 0x%p became uninitialized",
 779                       caller, l);
 780         if ((l->debug.state & USLOCK_TAKEN))
 781                 panic("%s:  lock 0x%p became TAKEN by someone else",
 782                       caller, l);
 783
 784         mycpu = cpu_number();
 785         l->debug.lock_thread = (void *) current_thread();
 786         l->debug.state |= USLOCK_TAKEN;
 787         l->debug.lock_pc = pc;
 788         l->debug.lock_cpu = mycpu;
 789
 790         usl_trace(l, mycpu, pc, caller);
 791 }
 792
 793
 794 /*
 795  *      For very special cases, set traced_lock to point to a
 796  *      specific lock of interest.  The result is a series of
 797  *      XPRs showing lock operations on that lock.  The lock_seq
 798  *      value is used to show the order of those operations.
 799  */
 800 usimple_lock_t          traced_lock;
 801 unsigned int            lock_seq;
 802
 803 void
 804 usl_trace(
 805         usimple_lock_t  l,
 806         int             mycpu,
 807         pc_t            pc,
 808         const char *    op_name)
 809 {
 810         if (traced_lock == l) {
 811                 XPR(XPR_SLOCK,
 812                     "seq %d, cpu %d, %s @ %x\n",
 813                     (uintptr_t) lock_seq, (uintptr_t) mycpu,
 814                     (uintptr_t) op_name, (uintptr_t) pc, 0);
 815                 lock_seq++;
 816         }
 817 }
 818
 819
 820 #endif  /* USLOCK_DEBUG */
 821
 822 /*
 823  *      Routine:        lck_rw_alloc_init
 824  */
 825 lck_rw_t *
 826 lck_rw_alloc_init(
 827         lck_grp_t       *grp,
 828         lck_attr_t      *attr) {
 829         lck_rw_t        *lck;
 830
 831         if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
 832                 bzero(lck, sizeof(lck_rw_t));
 833                 lck_rw_init(lck, grp, attr);
 834         }
 835
 836         return(lck);
 837 }
 838
 839 /*
 840  *      Routine:        lck_rw_free
 841  */
 842 void
 843 lck_rw_free(
 844         lck_rw_t        *lck,
 845         lck_grp_t       *grp) {
 846         lck_rw_destroy(lck, grp);
 847         kfree(lck, sizeof(lck_rw_t));
 848 }
 849
 850 /*
 851  *      Routine:        lck_rw_init
 852  */
 853 void
 854 lck_rw_init(
 855         lck_rw_t        *lck,
 856         lck_grp_t       *grp,
 857         lck_attr_t      *attr)
 858 {
 859         lck_attr_t      *lck_attr = (attr != LCK_ATTR_NULL) ?
 860                                         attr : &LockDefaultLckAttr;
 861
 862         hw_lock_byte_init(&lck->lck_rw_interlock);
 863         lck->lck_rw_want_write = FALSE;
 864         lck->lck_rw_want_upgrade = FALSE;
 865         lck->lck_rw_shared_count = 0;
 866         lck->lck_rw_can_sleep = TRUE;
 867         lck->lck_r_waiting = lck->lck_w_waiting = 0;
 868         lck->lck_rw_tag = 0;
 869         lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
 870                                 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
 871
 872         lck_grp_reference(grp);
 873         lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
 874 }
 875
 876 /*
 877  *      Routine:        lck_rw_destroy
 878  */
 879 void
 880 lck_rw_destroy(
 881         lck_rw_t        *lck,
 882         lck_grp_t       *grp)
 883 {
 884         if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
 885                 return;
 886 #if MACH_LDEBUG
 887         lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
 888 #endif
 889         lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
 890         lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
 891         lck_grp_deallocate(grp);
 892         return;
 893 }
 894
/*
 *      Sleep locks.  These use the same data structure and algorithm
 *      as the spin locks, but the process sleeps while it is waiting
 *      for the lock.  These work on uniprocessor systems.
 */

/* NOTE(review): units and users of this constant are not evident here -- confirm before changing. */
#define DECREMENTER_TIMEOUT 1000000
 902
 903 /*
 904  * We disable interrupts while holding the RW interlock to prevent an
 905  * interrupt from exacerbating hold time.
 906  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 907  */
 908 static inline boolean_t
 909 lck_interlock_lock(lck_rw_t *lck)
 910 {
 911         boolean_t       istate;
 912
 913         istate = ml_set_interrupts_enabled(FALSE);
 914         hw_lock_byte_lock(&lck->lck_rw_interlock);
 915         return istate;
 916 }
 917
 918 static inline void
 919 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
 920 {
             /*
              * Counterpart of lck_interlock_lock(): release the interlock
              * byte first, then pass 'istate' (the value returned by
              * lck_interlock_lock()) back to ml_set_interrupts_enabled().
              */
 921         hw_lock_byte_unlock(&lck->lck_rw_interlock);
 922         ml_set_interrupts_enabled(istate);
 923 }
 924
 925 /*
 926  * This inline is used when busy-waiting for an rw lock.
 927  * If interrupts were disabled when the lock primitive was called,
 928  * we poll the IPI handler for pending tlb flushes.
 929  * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
 930  */
 931 static inline void
 932 lck_rw_lock_pause(boolean_t interrupts_enabled)
 933 {
 934         if (!interrupts_enabled)
 935                 handle_pending_TLB_flushes();
 936         cpu_pause();
 937 }
 938
 939 static inline boolean_t
 940 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
 941 {
 942         if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE))
 943                 return TRUE;
 944         return FALSE;
 945 }
 946
 947 /*
 948  * compute the deadline to spin against when
 949  * waiting for a change of state on a lck_rw_t
 950  */
 951 static inline uint64_t
 952 lck_rw_deadline_for_spin(lck_rw_t *lck)
 953 {
 954         if (lck->lck_rw_can_sleep) {
 955                 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
 956                         /*
 957                          * there are already threads waiting on this lock... this
 958                          * implies that they have spun beyond their deadlines waiting for
 959                          * the desired state to show up so we will not bother spinning at this time...
 960                          *   or
 961                          * the current number of threads sharing this lock exceeds our capacity to run them
 962                          * concurrently and since all states we're going to spin for require the rw_shared_count
 963                          * to be at 0, we'll not bother spinning since the latency for this to happen is
 964                          * unpredictable...
 965                          */
 966                         return (mach_absolute_time());
 967                 }
 968                 return (mach_absolute_time() + MutexSpin);
 969         } else
 970                 return (mach_absolute_time() + (100000LL * 1000000000LL));
 971 }
 972
 973
 974 /*
 975  * Spin while interlock is held.
 976  */
 977
 978 static inline void
 979 lck_rw_interlock_spin(lck_rw_t *lock)
 980 {
 981         while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
 982                 cpu_pause();
 983         }
 984 }
 985
 986 static boolean_t
 987 lck_rw_grab_want(lck_rw_t *lock)
 988 {
 989         uint32_t        data, prev;
 990
 991         for ( ; ; ) {
 992                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
 993                 if ((data & LCK_RW_INTERLOCK) == 0)
 994                         break;
 995                 atomic_exchange_abort();
 996                 lck_rw_interlock_spin(lock);
 997         }
 998         if (data & LCK_RW_WANT_WRITE) {
 999                 atomic_exchange_abort();
1000                 return FALSE;
1001         }
1002         data |= LCK_RW_WANT_WRITE;
1003         return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1004 }
1005
1006 static boolean_t
1007 lck_rw_grab_shared(lck_rw_t *lock)
1008 {
1009         uint32_t        data, prev;
1010
1011         for ( ; ; ) {
1012                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1013                 if ((data & LCK_RW_INTERLOCK) == 0)
1014                         break;
1015                 atomic_exchange_abort();
1016                 lck_rw_interlock_spin(lock);
1017         }
1018         if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1019                 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1020                         atomic_exchange_abort();
1021                         return FALSE;
1022                 }
1023         }
1024         data += LCK_RW_SHARED_READER;
1025         return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1026 }
1027
1028 /*
1029  *      Routine:        lck_rw_lock_exclusive_gen
      *      Function:
      *              contended path for taking the lock exclusively... called
      *              by lck_rw_lock_exclusive() when its single atomic
      *              test-and-set does not succeed.  Works in two phases:
      *                1) obtain want_write: spin until the deadline given by
      *                   lck_rw_deadline_for_spin(), then, if the lock may
      *                   sleep, block on the writer event
      *                2) wait, in the same spin-then-block fashion, until no
      *                   readers, upgrader or interlock holder remain
      *              loops until the lock is held exclusively by the caller
1030  */
1031 static void
1032 lck_rw_lock_exclusive_gen(
1033         lck_rw_t        *lck)
1034 {
1035         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1036         uint64_t        deadline = 0;
1037         int             slept = 0;
1038         int             gotlock = 0;
1039         int             lockheld = 0;
1040         wait_result_t   res = 0;
             /* -1 means the interrupt state has not been sampled yet (see the istate == -1 tests) */
1041         boolean_t       istate = -1;
1042
1043 #if     CONFIG_DTRACE
1044         boolean_t dtrace_ls_initialized = FALSE;
1045         boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
1046         uint64_t wait_interval = 0;
1047         int readers_at_sleep = 0;
1048 #endif
1049
1050         /*
1051          *      Try to acquire the lck_rw_want_write bit.
1052          */
1053         while ( !lck_rw_grab_want(lck)) {
1054
1055 #if     CONFIG_DTRACE
1056                 if (dtrace_ls_initialized == FALSE) {
1057                         dtrace_ls_initialized = TRUE;
1058                         dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1059                         dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1060                         dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1061                         if (dtrace_ls_enabled) {
1062                                 /*
1063                                  * Either sleeping or spinning is happening,
1064                                  *  start a timing of our delay interval now.
1065                                  */
1066                                 readers_at_sleep = lck->lck_rw_shared_count;
1067                                 wait_interval = mach_absolute_time();
1068                         }
1069                 }
1070 #endif
1071                 if (istate == -1)
1072                         istate = ml_get_interrupts_enabled();
1073
1074                 deadline = lck_rw_deadline_for_spin(lck);
1075
1076                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1077
1078                 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
1079                         lck_rw_lock_pause(istate);
1080
1081                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1082
1083                 if (gotlock)
1084                         break;
1085                 /*
1086                  * if we get here, the deadline has expired w/o us
1087                  * being able to grab the lock exclusively
1088                  * check to see if we're allowed to do a thread_block
1089                  */
1090                 if (lck->lck_rw_can_sleep) {
1091
1092                         istate = lck_interlock_lock(lck);
1093
1094                         if (lck->lck_rw_want_write) {
1095
1096                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1097
1098                                 lck->lck_w_waiting = TRUE;
1099
                                     /*
                                      * the wait is asserted while the interlock is
                                      * still held and the interlock is dropped
                                      * before we actually block
                                      */
1100                                 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1101                                 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1102                                                 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1103                                 lck_interlock_unlock(lck, istate);
1104
1105                                 if (res == THREAD_WAITING) {
1106                                         res = thread_block(THREAD_CONTINUE_NULL);
1107                                         slept++;
1108                                 }
1109                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1110                         } else {
                                     /*
                                      * want_write is clear now that we hold the
                                      * interlock... claim it directly and move on
                                      * to waiting for the readers
                                      */
1111                                 lck->lck_rw_want_write = TRUE;
1112                                 lck_interlock_unlock(lck, istate);
1113                                 break;
1114                         }
1115                 }
1116         }
1117         /*
1118          * Wait for readers (and upgrades) to finish...
1119          * the test for these conditions must be done simultaneously with
1120          * a check of the interlock not being held since
1121          * the rw_shared_count will drop to 0 first and then want_upgrade
1122          * will be set to 1 in the shared_to_exclusive scenario... those
1123          * adjustments are done behind the interlock and represent an
1124          * atomic change in state and must be considered as such
1125          * however, once we see the read count at 0, the want_upgrade not set
1126          * and the interlock not held, we are safe to proceed
1127          */
1128         while (lck_rw_held_read_or_upgrade(lck)) {
1129
1130 #if     CONFIG_DTRACE
1131                 /*
1132                  * Either sleeping or spinning is happening, start
1133                  * a timing of our delay interval now.  If the lockstat
1134                  * probes are disabled, dtrace_ls_enabled stays FALSE and
1135                  * no spin or sleep event is recorded on the way out.
1136                  */
1137                 if (dtrace_ls_initialized == FALSE) {
1138                         dtrace_ls_initialized = TRUE;
1139                         dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1140                         dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1141                         dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1142                         if (dtrace_ls_enabled) {
1143                                 /*
1144                                  * Either sleeping or spinning is happening,
1145                                  *  start a timing of our delay interval now.
1146                                  */
1147                                 readers_at_sleep = lck->lck_rw_shared_count;
1148                                 wait_interval = mach_absolute_time();
1149                         }
1150                 }
1151 #endif
1152                 if (istate == -1)
1153                         istate = ml_get_interrupts_enabled();
1154
1155                 deadline = lck_rw_deadline_for_spin(lck);
1156
1157                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1158
1159                 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1160                         lck_rw_lock_pause(istate);
1161
1162                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1163
1164                 if ( !lockheld)
1165                         break;
1166                 /*
1167                  * if we get here, the deadline has expired w/o us
1168                  * being able to grab the lock exclusively
1169                  * check to see if we're allowed to do a thread_block
1170                  */
1171                 if (lck->lck_rw_can_sleep) {
1172
1173                         istate = lck_interlock_lock(lck);
1174
1175                         if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1176                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1177
1178                                 lck->lck_w_waiting = TRUE;
1179
1180                                 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1181                                 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1182                                                 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1183                                 lck_interlock_unlock(lck, istate);
1184
1185                                 if (res == THREAD_WAITING) {
1186                                         res = thread_block(THREAD_CONTINUE_NULL);
1187                                         slept++;
1188                                 }
1189                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1190                         } else {
1191                                 lck_interlock_unlock(lck, istate);
1192                                 /*
1193                                  * must own the lock now, since we checked for
1194                                  * readers or upgrade owner behind the interlock
1195                                  * no need for a call to 'lck_rw_held_read_or_upgrade'
1196                                  */
1197                                 break;
1198                         }
1199                 }
1200         }
1201
1202 #if     CONFIG_DTRACE
1203         /*
1204          * Decide what latencies we suffered that are Dtrace events.
1205          * If we have set wait_interval, then we either spun or slept.
1206          * At least we get out from under the interlock before we record
1207          * which is the best we can do here to minimize the impact
1208          * of the tracing.
1209          * If dtrace_ls_enabled is FALSE, then dtrace was not enabled when we
1210          * started sleeping/spinning so we don't record this event.
1211          */
1212         if (dtrace_ls_enabled == TRUE) {
1213                 if (slept == 0) {
1214                         LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1215                             mach_absolute_time() - wait_interval, 1);
1216                 } else {
1217                         /*
1218                          * For the blocking case, we also record if when we blocked
1219                          * it was held for read or write, and how many readers.
1220                          * Note that readers_at_sleep was sampled when we first
1221                          * began waiting, not under the interlock, so it is approximate.
1222                          */
1223                         LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1224                             mach_absolute_time() - wait_interval, 1,
1225                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1226                 }
1227         }
1228         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1229 #endif
1230 }
1231
1232 /*
1233  *      Routine:        lck_rw_done
      *      Function:
      *              release the caller's hold on the lock, whichever kind
      *              it is... the lock word is updated with an atomic
      *              exchange that is retried while the interlock is held or
      *              when the exchange fails, then the pre-update value is
      *              handed to lck_rw_done_gen() which issues the wakeups and
      *              supplies the return value
      *              panics if the word shows no readers and neither
      *              WANT_UPGRADE nor WANT_WRITE
1234  */
1235
1236 lck_rw_type_t lck_rw_done(lck_rw_t *lock)
1237 {
1238         uint32_t        data, prev;
1239
1240         for ( ; ; ) {
1241                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1242                 if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
1243                         atomic_exchange_abort();
1244                         lck_rw_interlock_spin(lock);
1245                         continue;
1246                 }
1247                 if (data & LCK_RW_SHARED_MASK) {
1248                         data -= LCK_RW_SHARED_READER;
1249                         if ((data & LCK_RW_SHARED_MASK) == 0)   /* if reader count has now gone to 0, check for waiters */
1250                                 goto check_waiters;
                             /* other readers remain: the waiter bits are left untouched */
1251                 } else {                                        /* if reader count == 0, must be exclusive lock */
1252                         if (data & LCK_RW_WANT_UPGRADE) {
1253                                 data &= ~(LCK_RW_WANT_UPGRADE);
1254                         } else {
                                     /* NOTE(review): tests WANT_WRITE but clears WANT_EXCL; presumably the same bit(s) -- confirm against the header */
1255                                 if (data & LCK_RW_WANT_WRITE)
1256                                         data &= ~(LCK_RW_WANT_EXCL);
1257                                 else                                    /* lock is not 'owned', panic */
1258                                         panic("Releasing non-exclusive RW lock without a reader refcount!");
1259                         }
                             /*
                              * also entered via the goto above when the last reader leaves...
                              * clear the waiting bits of whoever lck_rw_done_gen() is
                              * going to wake: a waiting writer always, waiting readers
                              * unless a writer is waiting and the lock is PRIV_EXCL
                              */
1260 check_waiters:
1261                         if (prev & LCK_RW_W_WAITING) {
1262                                 data &= ~(LCK_RW_W_WAITING);
1263                                 if ((prev & LCK_RW_PRIV_EXCL) == 0)
1264                                         data &= ~(LCK_RW_R_WAITING);
1265                         } else
1266                                 data &= ~(LCK_RW_R_WAITING);
1267                 }
1268                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1269                         break;
1270                 cpu_pause();
1271         }
             /* 'prev' is the lock word as it was just before our successful update */
1272         return lck_rw_done_gen(lock, prev);
1273 }
1274
1275 /*
1276  *      Routine:        lck_rw_done_gen
1277  *
1278  *      called from lck_rw_done()
1279  *      prior_lock_state is the value in the 1st
1280  *      word of the lock at the time of a successful
1281  *      atomic compare and exchange with the new value...
1282  *      it represents the state of the lock before we
1283  *      decremented the rw_shared_count or cleared either
1284  *      rw_want_upgrade or rw_want_write and
1285  *      the lck_x_waiting bits...  since the wrapper
1286  *      routine has already changed the state atomically,
1287  *      we just need to decide if we should
1288  *      wake up anyone and what value to return... we do
1289  *      this by examining the state of the lock before
1290  *      we changed it
1291  */
1292 static lck_rw_type_t
1293 lck_rw_done_gen(
1294         lck_rw_t        *lck,
1295         uint32_t        prior_lock_state)
1296 {
1297         lck_rw_t        *fake_lck;
1298         lck_rw_type_t   lock_type;
1299         thread_t        thread;
1300         uint32_t        rwlock_count;
1301
1302         /*
1303          * prior_lock state is a snapshot of the 1st word of the
1304          * lock in question... we'll fake up a pointer to it
1305          * and carefully not access anything beyond whats defined
1306          * in the first word of a lck_rw_t
1307          */
1308         fake_lck = (lck_rw_t *)&prior_lock_state;
1309
1310         if (fake_lck->lck_rw_shared_count <= 1) {
1311                 if (fake_lck->lck_w_waiting)
1312                         thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1313
1314                 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1315                         thread_wakeup(RW_LOCK_READER_EVENT(lck));
1316         }
1317         if (fake_lck->lck_rw_shared_count)
1318                 lock_type = LCK_RW_TYPE_SHARED;
1319         else
1320                 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1321
1322         /* Check if dropping the lock means that we need to unpromote */
1323         thread = current_thread();
1324         rwlock_count = thread->rwlock_count--;
1325 #if MACH_LDEBUG
1326         if (rwlock_count == 0) {
1327                 panic("rw lock count underflow for thread %p", thread);
1328         }
1329 #endif
1330         if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1331                 /* sched_flags checked without lock, but will be rechecked while clearing */
1332                 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1333         }
1334
1335 #if CONFIG_DTRACE
1336         LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1337 #endif
1338
1339         return(lock_type);
1340 }
1341
1342
1343 /*
1344  *      Routine:        lck_rw_unlock
1345  */
1346 void
1347 lck_rw_unlock(
1348         lck_rw_t        *lck,
1349         lck_rw_type_t   lck_rw_type)
1350 {
1351         if (lck_rw_type == LCK_RW_TYPE_SHARED)
1352                 lck_rw_unlock_shared(lck);
1353         else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1354                 lck_rw_unlock_exclusive(lck);
1355         else
1356                 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1357 }
1358
1359
1360 /*
1361  *      Routine:        lck_rw_unlock_shared
1362  */
1363 void
1364 lck_rw_unlock_shared(
1365         lck_rw_t        *lck)
1366 {
1367         lck_rw_type_t   ret;
1368
1369         assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1370         ret = lck_rw_done(lck);
1371
1372         if (ret != LCK_RW_TYPE_SHARED)
1373                 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1374 }
1375
1376
1377 /*
1378  *      Routine:        lck_rw_unlock_exclusive
1379  */
1380 void
1381 lck_rw_unlock_exclusive(
1382         lck_rw_t        *lck)
1383 {
1384         lck_rw_type_t   ret;
1385
1386         ret = lck_rw_done(lck);
1387
1388         if (ret != LCK_RW_TYPE_EXCLUSIVE)
1389                 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1390 }
1391
1392
1393 /*
1394  *      Routine:        lck_rw_lock
1395  */
1396 void
1397 lck_rw_lock(
1398         lck_rw_t        *lck,
1399         lck_rw_type_t   lck_rw_type)
1400 {
1401         if (lck_rw_type == LCK_RW_TYPE_SHARED)
1402                 lck_rw_lock_shared(lck);
1403         else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1404                 lck_rw_lock_exclusive(lck);
1405         else
1406                 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1407 }
1408
1409 /*
1410  *      Routine:        lck_rw_lock_shared
1411  */
1412 void
1413 lck_rw_lock_shared(lck_rw_t *lock)
1414 {
1415         uint32_t        data, prev;
1416
1417         current_thread()->rwlock_count++;
1418         for ( ; ; ) {
1419                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1420                 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1421                         atomic_exchange_abort();
1422                         lck_rw_lock_shared_gen(lock);
1423                         break;
1424                 }
1425                 data += LCK_RW_SHARED_READER;
1426                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1427                         break;
1428                 cpu_pause();
1429         }
1430 #if     CONFIG_DTRACE
1431         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1432 #endif  /* CONFIG_DTRACE */
1433         return;
1434 }
1435
1436 /*
1437  *      Routine:        lck_rw_lock_shared_gen
1438  *      Function:
1439  *              the fast path in lck_rw_lock_shared() found WANT_EXCL,
1440  *              WANT_UPGRADE or INTERLOCK set... this is where we spin/block
1441  *              until we can acquire the lock in the shared mode
1442  */
1443 static void
1444 lck_rw_lock_shared_gen(
1445         lck_rw_t        *lck)
1446 {
1447         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1448         uint64_t        deadline = 0;
1449         int             gotlock = 0;
1450         int             slept = 0;
1451         wait_result_t   res = 0;
             /* -1 means the interrupt state has not been sampled yet (see the istate == -1 test) */
1452         boolean_t       istate = -1;
1453
1454 #if     CONFIG_DTRACE
1455         uint64_t wait_interval = 0;
1456         int readers_at_sleep = 0;
1457         boolean_t dtrace_ls_initialized = FALSE;
1458         boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1459 #endif
1460
1461         while ( !lck_rw_grab_shared(lck)) {
1462
1463 #if     CONFIG_DTRACE
1464                 if (dtrace_ls_initialized == FALSE) {
1465                         dtrace_ls_initialized = TRUE;
1466                         dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1467                         dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1468                         dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1469                         if (dtrace_ls_enabled) {
1470                                 /*
1471                                  * Either sleeping or spinning is happening,
1472                                  *  start a timing of our delay interval now.
1473                                  */
1474                                 readers_at_sleep = lck->lck_rw_shared_count;
1475                                 wait_interval = mach_absolute_time();
1476                         }
1477                 }
1478 #endif
1479                 if (istate == -1)
1480                         istate = ml_get_interrupts_enabled();
1481
1482                 deadline = lck_rw_deadline_for_spin(lck);
1483
1484                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1485                              trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1486
1487                 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1488                         lck_rw_lock_pause(istate);
1489
1490                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1491                              trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1492
1493                 if (gotlock)
1494                         break;
1495                 /*
1496                  * if we get here, the deadline has expired w/o us
1497                  * being able to grab the lock for read
1498                  * check to see if we're allowed to do a thread_block
1499                  */
1500                 if (lck->lck_rw_can_sleep) {
1501
1502                         istate = lck_interlock_lock(lck);
1503
                             /*
                              * same refusal test as lck_rw_grab_shared(), but
                              * made while holding the interlock
                              */
1504                         if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1505                             ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1506
1507                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1508                                              trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1509
1510                                 lck->lck_r_waiting = TRUE;
1511
                                     /*
                                      * the wait is asserted while the interlock is
                                      * still held and the interlock is dropped
                                      * before we actually block
                                      */
1512                                 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1513                                 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1514                                                 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1515                                 lck_interlock_unlock(lck, istate);
1516
1517                                 if (res == THREAD_WAITING) {
1518                                         res = thread_block(THREAD_CONTINUE_NULL);
1519                                         slept++;
1520                                 }
1521                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1522                                              trace_lck, res, slept, 0, 0);
1523                         } else {
                                     /*
                                      * nothing stands in our way any more: take the
                                      * read reference directly under the interlock
                                      */
1524                                 lck->lck_rw_shared_count++;
1525                                 lck_interlock_unlock(lck, istate);
1526                                 break;
1527                         }
1528                 }
1529         }
1530
1531 #if     CONFIG_DTRACE
             /* report a spin or a block event only if a probe was enabled when we started waiting */
1532         if (dtrace_ls_enabled == TRUE) {
1533                 if (slept == 0) {
1534                         LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1535                 } else {
1536                         LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1537                             mach_absolute_time() - wait_interval, 0,
1538                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1539                 }
1540         }
1541         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1542 #endif
1543 }
1544
1545
1546 /*
1547  *      Routine:        lck_rw_lock_exclusive
1548  */
1549
1550 void
1551 lck_rw_lock_exclusive(lck_rw_t *lock)
1552 {
1553         current_thread()->rwlock_count++;
1554         if (atomic_test_and_set32(&lock->data,
1555                 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1556                 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1557 #if     CONFIG_DTRACE
1558                 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1559 #endif  /* CONFIG_DTRACE */
1560         } else
1561                 lck_rw_lock_exclusive_gen(lock);
1562 }
1563
1564
1565 /*
1566  *      Routine:        lck_rw_lock_shared_to_exclusive
      *      Function:
      *              try to turn the caller's read hold into an exclusive
      *              hold... in both outcomes the caller's reader count is
      *              given up by the atomic update
      *              returns TRUE once WANT_UPGRADE is ours and the other
      *              readers have drained
      *              if another thread already owns WANT_UPGRADE, returns
      *              the result of lck_rw_lock_shared_to_exclusive_failure()
      *              (FALSE), and the caller no longer holds the lock
1567  */
1568
1569 boolean_t
1570 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1571 {
1572         uint32_t        data, prev;
1573
1574         for ( ; ; ) {
1575                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1576                 if (data & LCK_RW_INTERLOCK) {
1577                         atomic_exchange_abort();
1578                         lck_rw_interlock_spin(lock);
1579                         continue;
1580                 }
1581                 if (data & LCK_RW_WANT_UPGRADE) {
                             /* somebody else is already upgrading: just drop our read count */
1582                         data -= LCK_RW_SHARED_READER;
1583                         if ((data & LCK_RW_SHARED_MASK) == 0)           /* we were the last reader */
1584                                 data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1585                         if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1586                                 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1587                 } else {
1588                         data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1589                         data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1590                         if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1591                                 break;
1592                 }
                     /* the exchange did not complete: pause and retry with a fresh copy of the lock word */
1593                 cpu_pause();
1594         }
1595                                                 /* we now own the WANT_UPGRADE */
1596         if (data & LCK_RW_SHARED_MASK)          /* check to see if all of the readers are drained */
1597                 lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1598 #if     CONFIG_DTRACE
1599         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1600 #endif
1601         return TRUE;
1602 }
1603
1604
1605 /*
1606  *      Routine:        lck_rw_lock_shared_to_exclusive_failure
1607  *      Function:
1608  *              assembly fast path code has already dropped our read
1609  *              count and determined that someone else owns 'lck_rw_want_upgrade'
1610  *              if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1611  *              all we need to do here is determine if a wakeup is needed
1612  */
1613 static boolean_t
1614 lck_rw_lock_shared_to_exclusive_failure(
1615         lck_rw_t        *lck,
1616         uint32_t        prior_lock_state)
1617 {
1618         lck_rw_t        *fake_lck;
1619         thread_t        thread = current_thread();
1620         uint32_t        rwlock_count;
1621
1622         /* Check if dropping the lock means that we need to unpromote */
1623         rwlock_count = thread->rwlock_count--;
1624 #if MACH_LDEBUG
1625         if (rwlock_count == 0) {
1626                 panic("rw lock count underflow for thread %p", thread);
1627         }
1628 #endif
1629         fake_lck = (lck_rw_t *)&prior_lock_state;
1630
1631         if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1632                 /*
1633                  *      Someone else has requested upgrade.
1634                  *      Since we've released the read lock, wake
1635                  *      him up if he's blocked waiting
1636                  */
1637                 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1638         }
1639
1640         if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1641                 /* sched_flags checked without lock, but will be rechecked while clearing */
1642                 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1643         }
1644
1645         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1646                      VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1647
1648         return (FALSE);
1649 }
1650
1651
1652 /*
1653  *      Routine:        lck_rw_lock_shared_to_exclusive_failure
1654  *      Function:
1655  *              assembly fast path code has already dropped our read
1656  *              count and successfully acquired 'lck_rw_want_upgrade'
1657  *              we just need to wait for the rest of the readers to drain
1658  *              and then we can return as the exclusive holder of this lock
1659  */
1660 static boolean_t
1661 lck_rw_lock_shared_to_exclusive_success(
1662         lck_rw_t        *lck)
1663 {
1664         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1665         uint64_t        deadline = 0;
1666         int             slept = 0;
1667         int             still_shared = 0;
1668         wait_result_t   res;
1669         boolean_t       istate = -1;
1670
1671 #if     CONFIG_DTRACE
1672         uint64_t wait_interval = 0;
1673         int readers_at_sleep = 0;
1674         boolean_t dtrace_ls_initialized = FALSE;
1675         boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1676 #endif
1677
1678         while (lck->lck_rw_shared_count != 0) {
1679
1680 #if     CONFIG_DTRACE
1681                 if (dtrace_ls_initialized == FALSE) {
1682                         dtrace_ls_initialized = TRUE;
1683                         dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1684                         dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1685                         dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1686                         if (dtrace_ls_enabled) {
1687                                 /*
1688                                  * Either sleeping or spinning is happening,
1689                                  *  start a timing of our delay interval now.
1690                                  */
1691                                 readers_at_sleep = lck->lck_rw_shared_count;
1692                                 wait_interval = mach_absolute_time();
1693                         }
1694                 }
1695 #endif
1696                 if (istate == -1)
1697                         istate = ml_get_interrupts_enabled();
1698
1699                 deadline = lck_rw_deadline_for_spin(lck);
1700
1701                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1702                              trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1703
1704                 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1705                         lck_rw_lock_pause(istate);
1706
1707                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1708                              trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1709
1710                 if ( !still_shared)
1711                         break;
1712                 /*
1713                  * if we get here, the deadline has expired w/o
1714                  * the rw_shared_count having drained to 0
1715                  * check to see if we're allowed to do a thread_block
1716                  */
1717                 if (lck->lck_rw_can_sleep) {
1718
1719                         istate = lck_interlock_lock(lck);
1720
1721                         if (lck->lck_rw_shared_count != 0) {
1722                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1723                                              trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1724
1725                                 lck->lck_w_waiting = TRUE;
1726
1727                                 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1728                                 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1729                                                 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1730                                 lck_interlock_unlock(lck, istate);
1731
1732                                 if (res == THREAD_WAITING) {
1733                                         res = thread_block(THREAD_CONTINUE_NULL);
1734                                         slept++;
1735                                 }
1736                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1737                                              trace_lck, res, slept, 0, 0);
1738                         } else {
1739                                 lck_interlock_unlock(lck, istate);
1740                                 break;
1741                         }
1742                 }
1743         }
1744 #if     CONFIG_DTRACE
1745         /*
1746          * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1747          */
1748         if (dtrace_ls_enabled == TRUE) {
1749                 if (slept == 0) {
1750                         LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1751                 } else {
1752                         LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1753                             mach_absolute_time() - wait_interval, 1,
1754                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1755                 }
1756         }
1757         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1758 #endif
1759         return (TRUE);
1760 }
1761
1762 /*
1763  *      Routine:        lck_rw_lock_exclusive_to_shared
1764  */
1765
1766 void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1767 {
1768         uint32_t        data, prev;
1769
1770         for ( ; ; ) {
1771                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1772                 if (data & LCK_RW_INTERLOCK) {
1773                         atomic_exchange_abort();
1774                         lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1775                         continue;
1776                 }
1777                 data += LCK_RW_SHARED_READER;
1778                 if (data & LCK_RW_WANT_UPGRADE)
1779                         data &= ~(LCK_RW_WANT_UPGRADE);
1780                 else
1781                         data &= ~(LCK_RW_WANT_EXCL);
1782                 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))
1783                         data &= ~(LCK_RW_W_WAITING);
1784                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1785                         break;
1786                 cpu_pause();
1787         }
1788         return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1789 }
1790
1791
1792 /*
1793  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1794  *      Function:
1795  *              assembly fast path has already dropped
1796  *              our exclusive state and bumped lck_rw_shared_count
1797  *              all we need to do here is determine if anyone
1798  *              needs to be awakened.
1799  */
1800 static void
1801 lck_rw_lock_exclusive_to_shared_gen(
1802         lck_rw_t        *lck,
1803         uint32_t        prior_lock_state)
1804 {
1805         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1806         lck_rw_t                *fake_lck;
1807
1808         fake_lck = (lck_rw_t *)&prior_lock_state;
1809
1810         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1811                              trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1812
1813         /*
1814          * don't wake up anyone waiting to take the lock exclusively
1815          * since we hold a read count... when the read count drops to 0,
1816          * the writers will be woken.
1817          *
1818          * wake up any waiting readers if we don't have any writers waiting,
1819          * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1820          */
1821         if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1822                 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1823
1824         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1825                              trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1826
1827 #if CONFIG_DTRACE
1828         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1829 #endif
1830 }
1831
1832
1833 /*
1834  *      Routine:        lck_rw_try_lock
1835  */
1836 boolean_t
1837 lck_rw_try_lock(
1838         lck_rw_t        *lck,
1839         lck_rw_type_t   lck_rw_type)
1840 {
1841         if (lck_rw_type == LCK_RW_TYPE_SHARED)
1842                 return(lck_rw_try_lock_shared(lck));
1843         else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1844                 return(lck_rw_try_lock_exclusive(lck));
1845         else
1846                 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1847         return(FALSE);
1848 }
1849
1850 /*
1851  *      Routine:        lck_rw_try_lock_shared
1852  */
1853
1854 boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
1855 {
1856         uint32_t        data, prev;
1857
1858         for ( ; ; ) {
1859                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1860                 if (data & LCK_RW_INTERLOCK) {
1861                         atomic_exchange_abort();
1862                         lck_rw_interlock_spin(lock);
1863                         continue;
1864                 }
1865                 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1866                         atomic_exchange_abort();
1867                         return FALSE;                   /* lock is busy */
1868                 }
1869                 data += LCK_RW_SHARED_READER;           /* Increment reader refcount */
1870                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1871                         break;
1872                 cpu_pause();
1873         }
1874         current_thread()->rwlock_count++;
1875         /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1876 #if     CONFIG_DTRACE
1877         LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1878 #endif  /* CONFIG_DTRACE */
1879         return TRUE;
1880 }
1881
1882
1883 /*
1884  *      Routine:        lck_rw_try_lock_exclusive
1885  */
1886
1887 boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
1888 {
1889         uint32_t        data, prev;
1890
1891         for ( ; ; ) {
1892                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1893                 if (data & LCK_RW_INTERLOCK) {
1894                         atomic_exchange_abort();
1895                         lck_rw_interlock_spin(lock);
1896                         continue;
1897                 }
1898                 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1899                         atomic_exchange_abort();
1900                         return FALSE;                           /* can't get it */
1901                 }
1902                 data |= LCK_RW_WANT_EXCL;
1903                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1904                         break;
1905                 cpu_pause();
1906         }
1907
1908         current_thread()->rwlock_count++;
1909 #if     CONFIG_DTRACE
1910         LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1911 #endif  /* CONFIG_DTRACE */
1912         return TRUE;
1913 }
1914
1915
1916 void
1917 lck_rw_assert(
1918         lck_rw_t        *lck,
1919         unsigned int    type)
1920 {
1921         switch (type) {
1922         case LCK_RW_ASSERT_SHARED:
1923                 if (lck->lck_rw_shared_count != 0) {
1924                         return;
1925                 }
1926                 break;
1927         case LCK_RW_ASSERT_EXCLUSIVE:
1928                 if ((lck->lck_rw_want_write ||
1929                      lck->lck_rw_want_upgrade) &&
1930                     lck->lck_rw_shared_count == 0) {
1931                         return;
1932                 }
1933                 break;
1934         case LCK_RW_ASSERT_HELD:
1935                 if (lck->lck_rw_want_write ||
1936                     lck->lck_rw_want_upgrade ||
1937                     lck->lck_rw_shared_count != 0) {
1938                         return;
1939                 }
1940                 break;
1941         case LCK_RW_ASSERT_NOTHELD:
1942                 if (!(lck->lck_rw_want_write ||
1943                           lck->lck_rw_want_upgrade ||
1944                           lck->lck_rw_shared_count != 0)) {
1945                         return;
1946                 }
1947                 break;
1948         default:
1949                 break;
1950         }
1951
1952         panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1953 }
1954
1955 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1956 void
1957 lck_rw_clear_promotions_x86(thread_t thread)
1958 {
1959 #if MACH_LDEBUG
1960         /* It's fatal to leave a RW lock locked and return to userspace */
1961         panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1962 #else
1963         /* Paper over the issue */
1964         thread->rwlock_count = 0;
1965         lck_rw_clear_promotion(thread, 0);
1966 #endif
1967 }
1968
1969 boolean_t
1970 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
1971 {
1972         lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
1973
1974         if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
1975                 lck_rw_unlock_shared(lck);
1976                 mutex_pause(2);
1977                 lck_rw_lock_shared(lck);
1978                 return TRUE;
1979         }
1980
1981         return FALSE;
1982 }
1983
1984 /*
1985  * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1986  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1987  */
1988 boolean_t
1989 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
1990         if (not_in_kdp) {
1991                 panic("panic: rw lock exclusive check done outside of kernel debugger");
1992         }
1993         return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1994 }
1995
1996 /*
1997  * Slow path routines for lck_mtx locking and unlocking functions.
1998  *
1999  * These functions were previously implemented in x86 assembly,
2000  * and some optimizations are in place in this c code to obtain a compiled code
2001  * as performant and compact as the assembly version.
2002  *
2003  * To keep these functions from being inlined into the fast path, all functions directly called by
2004  * the fast paths are marked __attribute__((noinline)). They are also all implemented
2005  * in such a way that the fast path can tail call into them. This way the return address
2006  * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2007  *
2008  * Slow path code is structured so that there are no calls to functions that will return
2009  * in the context of the caller function, i.e. all functions called are either tail call functions
2010  * or inline functions. The tail call functions take fewer than six arguments,
2011  * so that they can be passed in registers and do not need to be pushed on the stack.
2012  * This allows the compiler to not create a stack frame for the functions.
2013  *
2014  * __improbable and __probable are used to compile the slow path code in such a way that
2015  * the fast path case is a sequence of instructions with as few jumps as possible,
2016  * to make this case the most optimized even if falling through the slow path.
2017  */
2018
2019 /*
2020  * Intel lock invariants:
2021  *
2022  * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2023  * lck_mtx_pri: contains the max priority of all waiters during a contention period
2024  *      not cleared on last unlock, but stomped over on next first contention
2025  * lck_mtx_promoted: set when the current lock owner has been promoted
2026  *      cleared when lock owner unlocks, set on acquire or wait.
2027  *
2028  * The lock owner is promoted to the max priority of all its waiters only if it
2029  * was a lower priority when it acquired or was an owner when a waiter waited.
2030  * Max priority is capped at MAXPRI_PROMOTE.
2031  *
2032  * The last waiter will not be promoted as it is woken up, but the last
2033  * lock owner may not have been the last thread to have been woken up depending on the
2034  * luck of the draw.  Therefore a last-owner may still have the promoted-on-wakeup
2035  * flag set.
2036  *
2037  * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2038  *       priority from dropping priority in the future without having to take thread lock
2039  *       on acquire.
2040  */
2041
2042 #ifdef  MUTEX_ZONE
2043 extern zone_t lck_mtx_zone;
2044 #endif
2045
2046 /*
2047  * N.B.: On x86, statistics are currently recorded for all indirect mutexes.
2048  * Also, only the acquire attempt count (GRP_MTX_STAT_UTIL) is maintained
2049  * as a 64-bit quantity (the new x86 specific statistics are also maintained
2050  * as 32-bit quantities).
2051  *
2052  *
2053  * Enable this preprocessor define to record the first miss alone
2054  * By default, we count every miss, hence multiple misses may be
2055  * recorded for a single lock acquire attempt via lck_mtx_lock
2056  */
2057 #undef LOG_FIRST_MISS_ALONE
2058
2059 /*
2060  * This preprocessor define controls whether the R-M-W update of the
2061  * per-group statistics elements are atomic (LOCK-prefixed)
2062  * Enabled by default.
2063  */
2064 #define ATOMIC_STAT_UPDATES 1
2065
2066
2067 /*
2068  *      Routine:        lck_mtx_alloc_init
2069  */
2070 lck_mtx_t *
2071 lck_mtx_alloc_init(
2072         lck_grp_t       *grp,
2073         lck_attr_t      *attr)
2074 {
2075         lck_mtx_t       *lck;
2076 #ifdef  MUTEX_ZONE
2077         if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
2078                 lck_mtx_init(lck, grp, attr);
2079 #else
2080         if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
2081                 lck_mtx_init(lck, grp, attr);
2082 #endif
2083         return(lck);
2084 }
2085
2086 /*
2087  *      Routine:        lck_mtx_free
2088  */
2089 void
2090 lck_mtx_free(
2091         lck_mtx_t       *lck,
2092         lck_grp_t       *grp)
2093 {
2094         lck_mtx_destroy(lck, grp);
2095 #ifdef  MUTEX_ZONE
2096         zfree(lck_mtx_zone, lck);
2097 #else
2098         kfree(lck, sizeof(lck_mtx_t));
2099 #endif
2100 }
2101
2102 /*
2103  *      Routine:        lck_mtx_ext_init
2104  */
2105 static void
2106 lck_mtx_ext_init(
2107         lck_mtx_ext_t   *lck,
2108         lck_grp_t       *grp,
2109         lck_attr_t      *attr)
2110 {
2111         bzero((void *)lck, sizeof(lck_mtx_ext_t));
2112
2113         if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2114                 lck->lck_mtx_deb.type = MUTEX_TAG;
2115                 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2116         }
2117
2118         lck->lck_mtx_grp = grp;
2119
2120         if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
2121                 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2122
2123         lck->lck_mtx.lck_mtx_is_ext = 1;
2124         lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2125 }
2126
2127 /*
2128  *      Routine:        lck_mtx_init
2129  */
2130 void
2131 lck_mtx_init(
2132         lck_mtx_t       *lck,
2133         lck_grp_t       *grp,
2134         lck_attr_t      *attr)
2135 {
2136         lck_mtx_ext_t   *lck_ext;
2137         lck_attr_t      *lck_attr;
2138
2139         if (attr != LCK_ATTR_NULL)
2140                 lck_attr = attr;
2141         else
2142                 lck_attr = &LockDefaultLckAttr;
2143
2144         if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2145                 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2146                         lck_mtx_ext_init(lck_ext, grp, lck_attr);
2147                         lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2148                         lck->lck_mtx_ptr = lck_ext;
2149                 }
2150         } else {
2151                 lck->lck_mtx_owner = 0;
2152                 lck->lck_mtx_state = 0;
2153         }
2154         lck->lck_mtx_pad32 = 0xFFFFFFFF;
2155         lck_grp_reference(grp);
2156         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2157 }
2158
2159 /*
2160  *      Routine:        lck_mtx_init_ext
2161  */
2162 void
2163 lck_mtx_init_ext(
2164         lck_mtx_t       *lck,
2165         lck_mtx_ext_t   *lck_ext,
2166         lck_grp_t       *grp,
2167         lck_attr_t      *attr)
2168 {
2169         lck_attr_t      *lck_attr;
2170
2171         if (attr != LCK_ATTR_NULL)
2172                 lck_attr = attr;
2173         else
2174                 lck_attr = &LockDefaultLckAttr;
2175
2176         if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2177                 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2178                 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2179                 lck->lck_mtx_ptr = lck_ext;
2180         } else {
2181                 lck->lck_mtx_owner = 0;
2182                 lck->lck_mtx_state = 0;
2183         }
2184         lck->lck_mtx_pad32 = 0xFFFFFFFF;
2185
2186         lck_grp_reference(grp);
2187         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2188 }
2189
2190 static void
2191 lck_mtx_lock_mark_destroyed(
2192         lck_mtx_t *mutex,
2193         boolean_t indirect)
2194 {
2195         uint32_t state;
2196
2197         if (indirect) {
2198                 /* convert to destroyed state */
2199                 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2200                 return;
2201         }
2202
2203         state = ordered_load_mtx_state(mutex);
2204         lck_mtx_interlock_lock(mutex, &state);
2205
2206         ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2207
2208         enable_preemption();
2209 }
2210
2211 /*
2212  *      Routine:        lck_mtx_destroy
2213  */
2214 void
2215 lck_mtx_destroy(
2216         lck_mtx_t       *lck,
2217         lck_grp_t       *grp)
2218 {
2219         boolean_t indirect;
2220
2221         if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
2222                 return;
2223 #if MACH_LDEBUG
2224         lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2225 #endif
2226         indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2227
2228         lck_mtx_lock_mark_destroyed(lck, indirect);
2229
2230         if (indirect)
2231                 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2232         lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2233         lck_grp_deallocate(grp);
2234         return;
2235 }
2236
2237
2238 #if DEVELOPMENT | DEBUG
2239 __attribute__((noinline))
2240 void
2241 lck_mtx_owner_check_panic(
2242         lck_mtx_t       *lock)
2243 {
2244         thread_t owner = (thread_t)lock->lck_mtx_owner;
2245         panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2246 }
2247 #endif
2248
2249 __attribute__((always_inline))
2250 static boolean_t
2251 get_indirect_mutex(
2252         lck_mtx_t       **lock,
2253         uint32_t        *state)
2254 {
2255         *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2256         *state = ordered_load_mtx_state(*lock);
2257         return TRUE;
2258 }
2259
2260 /*
2261  * Routine:     lck_mtx_unlock_slow
2262  *
2263  * Unlocks a mutex held by current thread.
2264  *
2265  * It will wake up waiters if necessary and
2266  * drop promotions.
2267  *
2268  * Interlock can be held.
2269  */
2270 __attribute__((noinline))
2271 void
2272 lck_mtx_unlock_slow(
2273         lck_mtx_t       *lock)
2274 {
2275         thread_t        thread;
2276         uint32_t        state, prev;
2277         boolean_t       indirect = FALSE;
2278
2279         state = ordered_load_mtx_state(lock);
2280
2281         /* Is this an indirect mutex? */
2282         if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2283                 indirect = get_indirect_mutex(&lock, &state);
2284         }
2285
2286         thread = current_thread();
2287
2288 #if DEVELOPMENT | DEBUG
2289         thread_t owner = (thread_t)lock->lck_mtx_owner;
2290         if(__improbable(owner != thread))
2291                 return lck_mtx_owner_check_panic(lock);
2292 #endif
2293
2294         /* check if it is held as a spinlock */
2295         if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0))
2296                 goto unlock;
2297
2298         lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2299
2300 unlock:
2301         /* preemption disabled, interlock held and mutex not held */
2302
2303         /* clear owner */
2304         ordered_store_mtx_owner(lock, 0);
2305         /* keep original state in prev for later evaluation */
2306         prev = state;
2307         /* release interlock, promotion and clear spin flag */
2308         state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK));
2309         if ((state & LCK_MTX_WAITERS_MSK))
2310                 state -= LCK_MTX_WAITER;        /* decrement waiter count */
2311         ordered_store_mtx_state_release(lock, state);           /* since I own the interlock, I don't need an atomic update */
2312
2313 #if     MACH_LDEBUG
2314         /* perform lock statistics after drop to prevent delay */
2315         if (thread)
2316                 thread->mutex_count--;          /* lock statistic */
2317 #endif  /* MACH_LDEBUG */
2318
2319         /* check if there are waiters to wake up or priority to drop */
2320         if ((prev & (LCK_MTX_PROMOTED_MSK | LCK_MTX_WAITERS_MSK)))
2321                 return lck_mtx_unlock_wakeup_tail(lock, prev, indirect);
2322
2323         /* re-enable preemption */
2324         lck_mtx_unlock_finish_inline(lock, FALSE);
2325
2326         return;
2327 }
2328
2329 #define LCK_MTX_LCK_WAIT_CODE           0x20
2330 #define LCK_MTX_LCK_WAKEUP_CODE         0x21
2331 #define LCK_MTX_LCK_SPIN_CODE           0x22
2332 #define LCK_MTX_LCK_ACQUIRE_CODE        0x23
2333 #define LCK_MTX_LCK_DEMOTE_CODE         0x24
2334
2335 /*
2336  * Routine:    lck_mtx_unlock_wakeup_tail
2337  *
2338  * Invoked on unlock when there is
2339  * contention, i.e. the assembly routine sees
2340  * that mutex->lck_mtx_waiters != 0 or
2341  * that mutex->lck_mtx_promoted != 0
2342  *
2343  * neither the mutex or interlock is held
2344  *
2345  * Note that this routine might not be called if there are pending
2346  * waiters which have previously been woken up, and they didn't
2347  * end up boosting the old owner.
2348  *
2349  * assembly routine previously did the following to mutex:
2350  * (after saving the state in prior_lock_state)
2351  *      cleared lck_mtx_promoted
2352  *      decremented lck_mtx_waiters if nonzero
2353  *
2354  * This function needs to be called as a tail call
2355  * to optimize the compiled code.
2356  */
2357 __attribute__((noinline))
2358 static void
2359 lck_mtx_unlock_wakeup_tail (
2360         lck_mtx_t       *mutex,
2361         int             prior_lock_state,
2362         boolean_t       indirect)
2363 {
2364         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2365         lck_mtx_t               fake_lck;
2366
2367         /*
2368          * prior_lock state is a snapshot of the 2nd word of the
2369          * lock in question... we'll fake up a lock with the bits
2370          * copied into place and carefully not access anything
2371          * beyond whats defined in the second word of a lck_mtx_t
2372          */
2373         fake_lck.lck_mtx_state = prior_lock_state;
2374
2375         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2376                      trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
2377
2378         if (__probable(fake_lck.lck_mtx_waiters)) {
2379                 kern_return_t did_wake;
2380
2381                 if (fake_lck.lck_mtx_waiters > 1)
2382                         did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
2383                 else
2384                         did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex));
2385                 /*
2386                  * The waiters count always precisely matches the number of threads on the waitqueue.
2387                  * i.e. we should never see ret == KERN_NOT_WAITING.
2388                  */
2389                 assert(did_wake == KERN_SUCCESS);
2390         }
2391
2392         /* When lck_mtx_promoted was set, then I as the owner definitely have a promotion */
2393         if (__improbable(fake_lck.lck_mtx_promoted)) {
2394                 thread_t thread = current_thread();
2395
2396                 spl_t s = splsched();
2397                 thread_lock(thread);
2398
2399                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
2400                              thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
2401                 assert(thread->was_promoted_on_wakeup == 0);
2402                 assert(thread->promotions > 0);
2403
2404                 assert_promotions_invariant(thread);
2405
2406                 if (--thread->promotions == 0)
2407                         sched_thread_unpromote(thread, trace_lck);
2408
2409                 assert_promotions_invariant(thread);
2410
2411                 thread_unlock(thread);
2412                 splx(s);
2413         }
2414
2415         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2416                  trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2417
2418         lck_mtx_unlock_finish_inline(mutex, indirect);
2419 }
2420
2421 /*
2422  * Routine:     lck_mtx_lock_acquire_x86
2423  *
2424  * Invoked on acquiring the mutex when there is
2425  * contention (i.e. the assembly routine sees that
2426  * that mutex->lck_mtx_waiters != 0 or
2427  * thread->was_promoted_on_wakeup != 0)...
2428  *
2429  * mutex is owned...  interlock is held... preemption is disabled
2430  */
2431 __attribute__((always_inline))
2432 static void
2433 lck_mtx_lock_acquire_inline(
2434         lck_mtx_t       *mutex)
2435 {
2436         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2437         integer_t               priority;
2438
2439         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2440                      trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2441
2442         if (mutex->lck_mtx_waiters)
2443                 priority = mutex->lck_mtx_pri;
2444         else
2445                 priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */
2446
2447         /* the priority must have been set correctly by wait */
2448         assert(priority <= MAXPRI_PROMOTE);
2449         assert(priority == 0 || priority >= BASEPRI_DEFAULT);
2450
2451         /* if the mutex wasn't owned, then the owner wasn't promoted */
2452         assert(mutex->lck_mtx_promoted == 0);
2453
2454         thread_t thread = (thread_t)mutex->lck_mtx_owner;       /* faster than current_thread() */
2455
2456         if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
2457                 spl_t s = splsched();
2458                 thread_lock(thread);
2459
2460                 if (thread->was_promoted_on_wakeup)
2461                         assert(thread->promotions > 0);
2462
2463                 /* Intel only promotes if priority goes up */
2464                 if (thread->sched_pri < priority && thread->promotion_priority < priority) {
2465                         /* Remember that I need to drop this promotion on unlock */
2466                         mutex->lck_mtx_promoted = 1;
2467
2468                         if (thread->promotions++ == 0) {
2469                                 /* This is the first promotion for the owner */
2470                                 sched_thread_promote_to_pri(thread, priority, trace_lck);
2471                         } else {
2472                                 /*
2473                                  * Holder was previously promoted due to a different mutex,
2474                                  * raise to match this one.
2475                                  * Or, this thread was promoted on wakeup but someone else
2476                                  * later contended on mutex at higher priority before we got here
2477                                  */
2478                                 sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
2479                         }
2480                 }
2481
2482                 if (thread->was_promoted_on_wakeup) {
2483                         thread->was_promoted_on_wakeup = 0;
2484                         if (--thread->promotions == 0)
2485                                 sched_thread_unpromote(thread, trace_lck);
2486                 }
2487
2488                 thread_unlock(thread);
2489                 splx(s);
2490         }
2491         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2492                      trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2493 }
2494
2495 void
2496 lck_mtx_lock_acquire_x86(
2497         lck_mtx_t       *mutex)
2498 {
2499         return lck_mtx_lock_acquire_inline(mutex);
2500 }
2501
2502 /*
2503  * Tail call helpers for lock functions that perform
2504  * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2505  * the caller's compiled code.
2506  */
2507
2508 __attribute__((noinline))
2509 static void
2510 lck_mtx_lock_acquire_tail(
2511         lck_mtx_t       *mutex,
2512         boolean_t       indirect)
2513 {
2514         lck_mtx_lock_acquire_inline(mutex);
2515         lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect);
2516 }
2517
2518 __attribute__((noinline))
2519 static boolean_t
2520 lck_mtx_try_lock_acquire_tail(
2521         lck_mtx_t       *mutex)
2522 {
2523         lck_mtx_lock_acquire_inline(mutex);
2524         lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2525
2526         return TRUE;
2527 }
2528
2529 __attribute__((noinline))
2530 static void
2531 lck_mtx_convert_spin_acquire_tail(
2532         lck_mtx_t       *mutex)
2533 {
2534         lck_mtx_lock_acquire_inline(mutex);
2535         lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2536 }
2537
2538 boolean_t
2539 lck_mtx_ilk_unlock(
2540         lck_mtx_t       *mutex)
2541 {
2542         lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2543         return TRUE;
2544 }
2545
2546 static inline void
2547 lck_mtx_interlock_lock_set_and_clear_flags(
2548         lck_mtx_t *mutex,
2549         uint32_t xor_flags,
2550         uint32_t and_flags,
2551         uint32_t *new_state)
2552 {
2553         uint32_t state, prev;
2554         state = *new_state;
2555
2556         for ( ; ; ) {
2557                 /* have to wait for interlock to clear */
2558                 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2559                         cpu_pause();
2560                         state = ordered_load_mtx_state(mutex);
2561                 }
2562                 prev = state;                                   /* prev contains snapshot for exchange */
2563                 state |= LCK_MTX_ILOCKED_MSK | xor_flags;       /* pick up interlock */
2564                 state &= ~and_flags;                            /* clear flags */
2565
2566                 disable_preemption();
2567                 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE))
2568                         break;
2569                 enable_preemption();
2570                 cpu_pause();
2571                 state = ordered_load_mtx_state(mutex);
2572         }
2573         *new_state = state;
2574         return;
2575 }
2576
2577 static inline void
2578 lck_mtx_interlock_lock_clear_flags(
2579         lck_mtx_t *mutex,
2580         uint32_t and_flags,
2581         uint32_t *new_state)
2582 {
2583         return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2584 }
2585
2586 static inline void
2587 lck_mtx_interlock_lock(
2588         lck_mtx_t *mutex,
2589         uint32_t *new_state)
2590 {
2591         return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2592 }
2593
2594 static inline int
2595 lck_mtx_interlock_try_lock_set_flags(
2596         lck_mtx_t *mutex,
2597         uint32_t or_flags,
2598         uint32_t *new_state)
2599 {
2600         uint32_t state, prev;
2601         state = *new_state;
2602
2603         /* have to wait for interlock to clear */
2604         if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2605                 return 0;
2606         }
2607         prev = state;                                   /* prev contains snapshot for exchange */
2608         state |= LCK_MTX_ILOCKED_MSK | or_flags;        /* pick up interlock */
2609         disable_preemption();
2610         if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
2611                         *new_state = state;
2612                         return 1;
2613         }
2614
2615         enable_preemption();
2616         return 0;
2617 }
2618
2619 static inline int
2620 lck_mtx_interlock_try_lock(
2621         lck_mtx_t *mutex,
2622         uint32_t *new_state)
2623 {
2624         return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state);
2625 }
2626
2627 static inline int
2628 lck_mtx_interlock_try_lock_disable_interrupts(
2629         lck_mtx_t *mutex,
2630         boolean_t *istate)
2631 {
2632         uint32_t        state;
2633
2634         *istate = ml_set_interrupts_enabled(FALSE);
2635         state = ordered_load_mtx_state(mutex);
2636
2637         if (lck_mtx_interlock_try_lock(mutex, &state)) {
2638                 return 1;
2639         } else {
2640                 ml_set_interrupts_enabled(*istate);
2641                 return 0;
2642         }
2643 }
2644
2645 static inline void
2646 lck_mtx_interlock_unlock_enable_interrupts(
2647         lck_mtx_t *mutex,
2648         boolean_t istate)
2649 {
2650         lck_mtx_ilk_unlock(mutex);
2651         ml_set_interrupts_enabled(istate);
2652 }
2653
/*
 * Add one to the statistics counter that stat points to.
 *
 * When ATOMIC_STAT_UPDATES is enabled the increment is done with
 * os_atomic_inc(..., relaxed); otherwise it is a plain, non-atomic
 * increment.
 */
static void __inline__
lck_mtx_inc_stats(
        uint64_t* stat)
{
#if ATOMIC_STAT_UPDATES
        os_atomic_inc(stat, relaxed);
#else
        /*
         * A single increment.  Assigning the result of the post-increment
         * back to *stat would modify the object twice without sequencing
         * (undefined behavior) and can leave the counter unchanged.
         */
        (*stat)++;
#endif
}
2664
2665 static void __inline__
2666 lck_mtx_update_miss(
2667         struct _lck_mtx_ext_ *lock,
2668         int *first_miss)
2669 {
2670 #if LOG_FIRST_MISS_ALONE
2671         if ((*first_miss & 1) == 0) {
2672 #else
2673 #pragma unused(first_miss)
2674 #endif
2675                 uint64_t* stat = &lock->lck_mtx_grp->lck_grp_miss;
2676                 lck_mtx_inc_stats(stat);
2677
2678 #if LOG_FIRST_MISS_ALONE
2679                 *first_miss |= 1;
2680         }
2681 #endif
2682 }
2683
2684 static void __inline__
2685 lck_mtx_update_direct_wait(
2686         struct _lck_mtx_ext_ *lock)
2687 {
2688         uint64_t* stat = &lock->lck_mtx_grp->lck_grp_direct_wait;
2689         lck_mtx_inc_stats(stat);
2690 }
2691
2692 static void __inline__
2693 lck_mtx_update_wait(
2694         struct _lck_mtx_ext_ *lock,
2695         int *first_miss)
2696 {
2697 #if LOG_FIRST_MISS_ALONE
2698         if ((*first_miss & 2) == 0) {
2699 #else
2700 #pragma unused(first_miss)
2701 #endif
2702                 uint64_t* stat = &lock->lck_mtx_grp->lck_grp_wait;
2703                 lck_mtx_inc_stats(stat);
2704
2705 #if LOG_FIRST_MISS_ALONE
2706                 *first_miss |= 2;
2707         }
2708 #endif
2709 }
2710
2711 static void __inline__
2712 lck_mtx_update_util(
2713         struct _lck_mtx_ext_ *lock)
2714 {
2715         uint64_t* stat = &lock->lck_mtx_grp->lck_grp_util;
2716         lck_mtx_inc_stats(stat);
2717 }
2718
/*
 * Routine:     lck_mtx_lock_contended
 *
 * Contended path used by the slow lock routines once the mutex is found
 * held.  Repeats until the mutex is acquired: spin through
 * lck_mtx_lock_spinwait_x86(), then either take the mutex under the
 * interlock or block in lck_mtx_lock_wait_x86() and start over.
 *
 * lock         mutex to acquire
 * indirect     when set, lock is cast to struct _lck_mtx_ext_ and its
 *              group statistics are updated
 * first_miss   flag word forwarded to the statistics helpers
 *
 * Finishes through lck_mtx_lock_acquire_tail() or
 * lck_mtx_lock_finish_inline() with the mutex owned.
 *
 * NOTE(review): callers in this file pass the address of an int for
 * first_miss, and the helpers it is forwarded to take int *, while the
 * parameter here is declared boolean_t * -- confirm the types agree.
 */
__attribute__((noinline))
static void
lck_mtx_lock_contended(
        lck_mtx_t       *lock,
        boolean_t indirect,
        boolean_t *first_miss)
{
        lck_mtx_spinwait_ret_type_t ret;
        uint32_t state;
        thread_t thread;

try_again:

        /* every pass through here is a miss for statistics purposes */
        if (indirect) {
                lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
        }

        ret = lck_mtx_lock_spinwait_x86(lock);
        /* snapshot taken after spinning; used below as the starting state */
        state = ordered_load_mtx_state(lock);
        switch (ret) {
        case LCK_MTX_SPINWAIT_NO_SPIN:
                /*
                 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
                 * try to spin.
                 */
                if (indirect) {
                        lck_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
                }

                /* just fall through case LCK_MTX_SPINWAIT_SPUN */
        case LCK_MTX_SPINWAIT_SPUN:
                /*
                 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
                 * interlock not held
                 */
                lck_mtx_interlock_lock(lock, &state);
                assert(state & LCK_MTX_ILOCKED_MSK);

                if (state & LCK_MTX_MLOCKED_MSK) {
                        /* still held as a mutex: account for the wait and block */
                        if (indirect) {
                                lck_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
                        }
                        lck_mtx_lock_wait_x86(lock);
                        /*
                         * interlock is not held here.
                         */
                        goto try_again;
                } else {

                        /* grab the mutex */
                        state |= LCK_MTX_MLOCKED_MSK;
                        ordered_store_mtx_state_release(lock, state);
                        thread = current_thread();
                        ordered_store_mtx_owner(lock, (uintptr_t)thread);
#if     MACH_LDEBUG
                        if (thread) {
                                thread->mutex_count++;
                        }
#endif  /* MACH_LDEBUG */
                }

                break;
        case LCK_MTX_SPINWAIT_ACQUIRED:
                /*
                 * mutex has been acquired by lck_mtx_lock_spinwait_x86
                 * interlock is held and preemption disabled
                 * owner is set and mutex marked as locked
                 * statistics updated too
                 */
                break;
        default:
                panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
        }

        /*
         * interlock is already acquired here
         */

        /* mutex has been acquired */
        thread = (thread_t)lock->lck_mtx_owner;
        /* waiters present or a wakeup promotion pending: run the acquire work first */
        if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) {
                return lck_mtx_lock_acquire_tail(lock, indirect);
        }

        /* release the interlock */
        lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
}
2806
2807 /*
2808  * Helper noinline functions for calling
2809  * panic to optimize compiled code.
2810  */
2811
2812 __attribute__((noinline))
2813 static void
2814 lck_mtx_destroyed(
2815         lck_mtx_t       *lock)
2816 {
2817         panic("trying to interlock destroyed mutex (%p)", lock);
2818 }
2819
2820 __attribute__((noinline))
2821 static boolean_t
2822 lck_mtx_try_destroyed(
2823         lck_mtx_t       *lock)
2824 {
2825         panic("trying to interlock destroyed mutex (%p)", lock);
2826         return FALSE;
2827 }
2828
2829 __attribute__((always_inline))
2830 static boolean_t
2831 lck_mtx_lock_wait_interlock_to_clear(
2832         lck_mtx_t       *lock,
2833         uint32_t*        new_state)
2834 {
2835         uint32_t state;
2836
2837         for ( ; ; ) {
2838                 cpu_pause();
2839                 state = ordered_load_mtx_state(lock);
2840                 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2841                         *new_state = state;
2842                         return TRUE;
2843                 }
2844                 if (state & LCK_MTX_MLOCKED_MSK) {
2845                         /* if it is held as mutex, just fail */
2846                         return FALSE;
2847                 }
2848         }
2849 }
2850
2851 __attribute__((always_inline))
2852 static boolean_t
2853 lck_mtx_try_lock_wait_interlock_to_clear(
2854         lck_mtx_t       *lock,
2855         uint32_t*        new_state)
2856 {
2857         uint32_t state;
2858
2859         for ( ; ; ) {
2860                 cpu_pause();
2861                 state = ordered_load_mtx_state(lock);
2862                 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2863                         /* if it is held as mutex or spin, just fail */
2864                         return FALSE;
2865                 }
2866                 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2867                         *new_state = state;
2868                         return TRUE;
2869                 }
2870         }
2871 }
2872
/*
 * Routine:     lck_mtx_lock_slow
 *
 * Locks a mutex for current thread.
 * If the lock is contended this function might
 * sleep.
 *
 * Called with interlock not held.
 *
 * Panics (via lck_mtx_destroyed) if the state equals the destroyed tag.
 * Hands off to lck_mtx_lock_contended() whenever the mutex is found
 * held; otherwise takes interlock and mutex bit in one atomic step,
 * records the owner and finishes through the acquire-tail or finish
 * routine.
 */
__attribute__((noinline))
void
lck_mtx_lock_slow(
        lck_mtx_t       *lock)
{
        boolean_t       indirect = FALSE;
        uint32_t        state;
        int             first_miss = 0;         /* flag word shared with the statistics helpers */

        state = ordered_load_mtx_state(lock);

        /* is the interlock or mutex held */
        if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
                /*
                 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
                 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
                 * set in state (state == lck_mtx_tag)
                 */


                /* is the mutex already held and not indirect */
                if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
                        /* no, must have been the mutex */
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }

                /* check to see if it is marked destroyed */
                if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
                        return lck_mtx_destroyed(lock);
                }

                /* Is this an indirect mutex? */
                if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
                        /*
                         * get_indirect_mutex() is handed &lock and &state;
                         * presumably it redirects both to the extended mutex
                         * -- confirm against its definition.
                         */
                        indirect = get_indirect_mutex(&lock, &state);

                        first_miss = 0;
                        lck_mtx_update_util((struct _lck_mtx_ext_*)lock);

                        if (state & LCK_MTX_SPIN_MSK) {
                                /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
                                assert(state & LCK_MTX_ILOCKED_MSK);
                                lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
                        }
                }

                /* spin until the interlock clears; FALSE means it is held as a mutex */
                if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }
        }

        /* no - can't be INDIRECT, DESTROYED or locked */
        /* take interlock and mutex bit together; on failure wait and retry with a fresh state */
        while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
                if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }
        }

        /* lock and interlock acquired */

        thread_t thread = current_thread();
        /* record owner of mutex */
        ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
        if (thread) {
                thread->mutex_count++;          /* lock statistic */
        }
#endif
        /*
         * Check if there are waiters to
         * inherit their priority.
         */
        if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
                return lck_mtx_lock_acquire_tail(lock, indirect);
        }

        /* release the interlock */
        lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);

        return;
}
2963
/*
 * Routine:     lck_mtx_try_lock_slow
 *
 * Attempts to lock a mutex for the current thread without going
 * through the contended path.
 *
 * Returns TRUE when the mutex was acquired, FALSE when it was found
 * held as a mutex or as a spin lock.  Panics (via
 * lck_mtx_try_destroyed) if the state equals the destroyed tag.
 * For an indirect mutex a failed attempt is counted as a miss.
 */
__attribute__((noinline))
boolean_t
lck_mtx_try_lock_slow(
        lck_mtx_t       *lock)
{
        boolean_t       indirect = FALSE;
        uint32_t        state;
        int             first_miss = 0;         /* flag word shared with the statistics helpers */

        state = ordered_load_mtx_state(lock);

        /* is the interlock or mutex held */
        if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
                /*
                 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
                 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
                 * set in state (state == lck_mtx_tag)
                 */

                /* is the mutex already held and not indirect */
                if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
                        return FALSE;
                }

                /* check to see if it is marked destroyed */
                if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
                        return lck_mtx_try_destroyed(lock);
                }

                /* Is this an indirect mutex? */
                if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
                        /*
                         * get_indirect_mutex() is handed &lock and &state;
                         * presumably it redirects both to the extended mutex
                         * -- confirm against its definition.
                         */
                        indirect = get_indirect_mutex(&lock, &state);

                        first_miss = 0;
                        lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
                }

                /* spin until the interlock clears; FALSE means held as mutex or spin */
                if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
                        if (indirect)
                                lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
                        return FALSE;
                }
        }

        /* no - can't be INDIRECT, DESTROYED or locked */
        /* take interlock and mutex bit together; on failure wait and retry with a fresh state */
        while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
                if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
                        if (indirect)
                                lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
                        return FALSE;
                }
        }

        /* lock and interlock acquired */

        thread_t thread = current_thread();
        /* record owner of mutex */
        ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
        if (thread) {
                thread->mutex_count++;          /* lock statistic */
        }
#endif
        /*
         * Check if there are waiters to
         * inherit their priority.
         */
        if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
                return lck_mtx_try_lock_acquire_tail(lock);
        }

        /* release the interlock */
        lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));

        return TRUE;

}
3042
/*
 * Routine:     lck_mtx_lock_spin_slow
 *
 * Locks a mutex in spin mode for the current thread: on the
 * uncontended path the interlock is taken together with
 * LCK_MTX_SPIN_MSK and the routine returns with the interlock still
 * held.
 *
 * Whenever the mutex is found held the request is handed to
 * lck_mtx_lock_contended() instead.  Panics (via lck_mtx_destroyed)
 * if the state equals the destroyed tag.
 */
__attribute__((noinline))
void
lck_mtx_lock_spin_slow(
        lck_mtx_t       *lock)
{
        boolean_t       indirect = FALSE;
        uint32_t        state;
        int             first_miss = 0;         /* flag word shared with the statistics helpers */

        state = ordered_load_mtx_state(lock);

        /* is the interlock or mutex held */
        if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
                /*
                 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
                 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
                 * set in state (state == lck_mtx_tag)
                 */


                /* is the mutex already held and not indirect */
                if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
                        /* no, must have been the mutex */
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }

                /* check to see if it is marked destroyed */
                if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
                        return lck_mtx_destroyed(lock);
                }

                /* Is this an indirect mutex? */
                if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
                        /*
                         * get_indirect_mutex() is handed &lock and &state;
                         * presumably it redirects both to the extended mutex
                         * -- confirm against its definition.
                         */
                        indirect = get_indirect_mutex(&lock, &state);

                        first_miss = 0;
                        lck_mtx_update_util((struct _lck_mtx_ext_*)lock);

                        if (state & LCK_MTX_SPIN_MSK) {
                                /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
                                assert(state & LCK_MTX_ILOCKED_MSK);
                                lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
                        }
                }

                /* spin until the interlock clears; FALSE means it is held as a mutex */
                if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }
        }

        /* no - can't be INDIRECT, DESTROYED or locked */
        /* take interlock and spin bit together; on failure wait and retry with a fresh state */
        while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) {
                if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
                        return lck_mtx_lock_contended(lock, indirect, &first_miss);
                }
        }

        /* lock as spinlock and interlock acquired */

        thread_t thread = current_thread();
        /* record owner of mutex */
        ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
        if (thread) {
                thread->mutex_count++;          /* lock statistic */
        }
#endif

#if     CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
#endif
        /* return with the interlock held and preemption disabled */
        return;
}
3118
/*
 * Routine:     lck_mtx_try_lock_spin_slow
 *
 * Attempts to lock a mutex in spin mode for the current thread without
 * going through the contended path.
 *
 * Returns TRUE when the interlock was taken together with
 * LCK_MTX_SPIN_MSK (the interlock is not released here), FALSE when
 * the mutex was found held as a mutex or as a spin lock.  Panics (via
 * lck_mtx_try_destroyed) if the state equals the destroyed tag.
 * For an indirect mutex a failed attempt is counted as a miss.
 */
__attribute__((noinline))
boolean_t
lck_mtx_try_lock_spin_slow(
        lck_mtx_t       *lock)
{
        boolean_t       indirect = FALSE;
        uint32_t        state;
        int             first_miss = 0;         /* flag word shared with the statistics helpers */

        state = ordered_load_mtx_state(lock);

        /* is the interlock or mutex held */
        if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
                /*
                 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
                 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
                 * set in state (state == lck_mtx_tag)
                 */

                /* is the mutex already held and not indirect */
                if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
                        return FALSE;
                }

                /* check to see if it is marked destroyed */
                if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
                        return lck_mtx_try_destroyed(lock);
                }

                /* Is this an indirect mutex? */
                if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
                        /*
                         * get_indirect_mutex() is handed &lock and &state;
                         * presumably it redirects both to the extended mutex
                         * -- confirm against its definition.
                         */
                        indirect = get_indirect_mutex(&lock, &state);

                        first_miss = 0;
                        lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
                }

                /* spin until the interlock clears; FALSE means held as mutex or spin */
                if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
                        if (indirect)
                                lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
                        return FALSE;
                }
        }

        /* no - can't be INDIRECT, DESTROYED or locked */
        /* take interlock and spin bit together; on failure wait and retry with a fresh state */
        while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
                if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
                        if (indirect)
                                lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
                        return FALSE;
                }
        }

        /* lock and interlock acquired */

        thread_t thread = current_thread();
        /* record owner of mutex */
        ordered_store_mtx_owner(lock, (uintptr_t)thread);

#if MACH_LDEBUG
        if (thread) {
                thread->mutex_count++;          /* lock statistic */
        }
#endif

#if     CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
        return TRUE;

}
3190
3191 __attribute__((noinline))
3192 void
3193 lck_mtx_convert_spin(
3194         lck_mtx_t       *lock)
3195 {
3196         uint32_t state;
3197
3198         state = ordered_load_mtx_state(lock);
3199
3200         /* Is this an indirect mutex? */
3201         if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3202                 /* If so, take indirection */
3203                 get_indirect_mutex(&lock, &state);
3204         }
3205
3206         assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3207
3208         if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3209                 /* already owned as a mutex, just return */
3210                 return;
3211         }
3212
3213         assert(get_preemption_level() > 0);
3214         assert(state & LCK_MTX_ILOCKED_MSK);
3215         assert(state & LCK_MTX_SPIN_MSK);
3216
3217         /*
3218          * Check if there are waiters to
3219          * inherit their priority.
3220          */
3221         if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3222                 return lck_mtx_convert_spin_acquire_tail(lock);
3223         }
3224
3225         lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3226
3227         return;
3228 }
3229
3230 static inline boolean_t
3231 lck_mtx_lock_grab_mutex(
3232         lck_mtx_t       *lock)
3233 {
3234         uint32_t state;
3235
3236         state = ordered_load_mtx_state(lock);
3237
3238         if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3239                 return FALSE;
3240         }
3241
3242         /* lock and interlock acquired */
3243
3244         thread_t thread = current_thread();
3245         /* record owner of mutex */
3246         ordered_store_mtx_owner(lock, (uintptr_t)thread);
3247
3248 #if MACH_LDEBUG
3249         if (thread) {
3250                 thread->mutex_count++;          /* lock statistic */
3251         }
3252 #endif
3253         return TRUE;
3254 }
3255
3256 __attribute__((noinline))
3257 void
3258 lck_mtx_assert(
3259         lck_mtx_t       *lock,
3260         unsigned int    type)
3261 {
3262         thread_t thread, owner;
3263         uint32_t state;
3264
3265         thread = current_thread();
3266         state = ordered_load_mtx_state(lock);
3267
3268         if (state == LCK_MTX_TAG_INDIRECT) {
3269                 get_indirect_mutex(&lock, &state);
3270         }
3271
3272         owner = (thread_t)lock->lck_mtx_owner;
3273
3274         if (type == LCK_MTX_ASSERT_OWNED) {
3275                 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))
3276                         panic("mutex (%p) not owned\n", lock);
3277         } else {
3278                 assert (type == LCK_MTX_ASSERT_NOTOWNED);
3279                 if (owner == thread)
3280                         panic("mutex (%p) owned\n", lock);
3281         }
3282 }
3283
3284 /*
3285  * Routine:     lck_mtx_lock_spinwait_x86
3286  *
3287  * Invoked when trying to acquire a mutex when there is contention but
3288  * the holder is running on another processor. We spin for up to a maximum
3289  * time (MutexSpin past the time of entry) waiting for the lock to be released.
3290  *
3291  * Called with the interlock unlocked.
3292  * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3293  * returns LCK_MTX_SPINWAIT_SPUN if we spun without acquiring the mutex
3294  * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running (or idle)
3295  */
3296 __attribute__((noinline))
3297 lck_mtx_spinwait_ret_type_t
3298 lck_mtx_lock_spinwait_x86(
3299         lck_mtx_t       *mutex)
3300 {
3301         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3302         thread_t        holder;
3303         uint64_t        overall_deadline;       /* absolute time at which we give up spinning */
3304         uint64_t        check_owner_deadline;   /* earliest time of the next owner inspection */
3305         uint64_t        cur_time;               /* latest mach_absolute_time() sample */
3306         lck_mtx_spinwait_ret_type_t             retval = LCK_MTX_SPINWAIT_SPUN;
3307         int             loopcount = 0;          /* number of completed loop iterations */
3308
3309         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3310                      trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3311
3312         cur_time = mach_absolute_time();
3313         overall_deadline = cur_time + MutexSpin;
3314         check_owner_deadline = cur_time;        /* owner inspection is due immediately */
3315
3316         /*
3317          * Spin while:
3318          *   - mutex is locked, and
3319          *   - it's locked as a spin lock, and
3320          *   - owner is running on another processor, and
3321          *   - owner (processor) is not idling, and
3322          *   - we haven't spun for long enough.
3323          */
3324         do {
3325                 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3326                         retval = LCK_MTX_SPINWAIT_ACQUIRED;
3327                         break;
3328                 }
3329                 cur_time = mach_absolute_time();
3330
3331                 if (cur_time >= overall_deadline)       /* spin budget exhausted */
3332                         break;
3333
3334                 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {  /* owner peeked at without the interlock */
3335                         boolean_t       istate;
3336
3337                         /*
3338                          * We will repeatedly peek at the state of the lock while spinning,
3339                          * and we will acquire the interlock to do so.
3340                          * The thread that will unlock the mutex will also need to acquire
3341                          * the interlock, and we want to avoid slowing it down.
3342                          * To avoid taking an interrupt while holding the interlock,
3343                          * which would increase the time we are holding it, we
3344                          * will try to acquire the interlock with interrupts disabled.
3345                          * This is safe because it is a "try_lock": if we can't acquire
3346                          * the interlock we re-enable the interrupts and fail, so it is
3347                          * ok to call it even if the interlock was already held.
3348                          */
3349                         if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
3350
3351                                 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {  /* re-read the owner under the interlock */
3352
3353                                         if ( !(holder->machine.specFlags & OnProc) ||
3354                                              (holder->state & TH_IDLE)) {  /* holder lacks OnProc or is marked TH_IDLE: stop spinning */
3355
3356                                                 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3357
3358                                                 if (loopcount == 0)     /* no iteration completed yet, so we never actually spun */
3359                                                         retval = LCK_MTX_SPINWAIT_NO_SPIN;
3360                                                 break;
3361                                         }
3362                                 }
3363                                 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3364
3365                                 check_owner_deadline = cur_time + (MutexSpin / 4);  /* next inspection a quarter of MutexSpin from now */
3366                         }       /* if the try-lock failed, the inspection is retried on the next iteration */
3367                 }
3368                 cpu_pause();
3369
3370                 loopcount++;
3371
3372         } while (TRUE);
3373
3374 #if     CONFIG_DTRACE
3375         /*
3376          * We've already kept a count via overall_deadline of how long we spun.
3377          * If dtrace is active, then we compute backwards to decide how
3378          * long we spun: (overall_deadline - MutexSpin) is the time we started.
3379          *
3380          * Note that we record a different probe id depending on whether
3381          * this is a direct or indirect mutex.  This allows us to
3382          * penalize only lock groups that have debug/stats enabled
3383          * with dtrace processing if desired.
3384          */
3385         if (__probable(mutex->lck_mtx_is_ext == 0)) {
3386                 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3387                         mach_absolute_time() - (overall_deadline - MutexSpin));
3388         } else {
3389                 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3390                         mach_absolute_time() - (overall_deadline - MutexSpin));
3391         }
3392         /* The lockstat acquire event is recorded by the assembly code beneath us. */
3393 #endif
3394
3395         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3396                      trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3397
3398         return retval;
3399 }
3400
3401
3402
3403 /*
3404  * Routine:     lck_mtx_lock_wait_x86
3405  *
3406  * Invoked in order to wait on contention: records the caller as a waiter,
3407  * promotes the holder's priority if it is below the mutex's, then blocks.
3408  * Called with the interlock locked and
3409  * preemption disabled...
3410  * returns it unlocked and with preemption enabled
3411  *
3412  * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3413  *      A runnable waiter can exist between wait and acquire
3414  *      without a waiters count being set.
3415  *      This allows us to never make a spurious wakeup call.
3416  *
3417  * Priority:
3418  *      This avoids taking the thread lock if the owning thread's sched_pri is already at least lck_mtx_pri.
3419  *      This optimizes the case of same-priority threads contending on a lock.
3420  *      However, that allows the owning thread to drop in priority while holding the lock,
3421  *      because there is no state that the priority change can notice that
3422  *      says that the targeted thread holds a contended mutex.
3423  *
3424  *      One possible solution: priority changes could look for some atomic tag
3425  *      on the thread saying 'holding contended lock', and then set up a promotion.
3426  *      Needs a story for dropping that promotion - the last contended unlock
3427  *      has to notice that this has happened.
3428  */
3429 __attribute__((noinline))
3430 void
3431 lck_mtx_lock_wait_x86 (
3432         lck_mtx_t       *mutex)
3433 {
3434 #if     CONFIG_DTRACE
3435         uint64_t sleep_start = 0;       /* stays 0 unless a block probe is enabled */
3436
3437         if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3438                 sleep_start = mach_absolute_time();
3439         }
3440 #endif
3441         thread_t self = current_thread();
3442         assert(self->waiting_for_mutex == NULL);        /* must not already be recorded as waiting on a mutex */
3443
3444         self->waiting_for_mutex = mutex;        /* cleared again after thread_block() returns */
3445
3446         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3447
3448         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3449                      trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3450                      mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
3451
3452         integer_t waiter_pri = self->sched_pri;         /* start from our scheduled priority */
3453         waiter_pri = MAX(waiter_pri, self->base_pri);   /* ...but no lower than our base priority */
3454         waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT);  /* floor at BASEPRI_DEFAULT */
3455         waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE);   /* cap at MAXPRI_PROMOTE */
3456
3457         assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE);
3458
3459         /* Re-initialize lck_mtx_pri if this is the first contention; otherwise only raise it */
3460         if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri)
3461                 mutex->lck_mtx_pri = waiter_pri;
3462
3463         thread_t holder = (thread_t)mutex->lck_mtx_owner;
3464
3465         assert(holder != NULL);
3466
3467         /*
3468          * Intel only causes a promotion when priority needs to change,
3469          * reducing thread lock holds but leaving us vulnerable to the holder
3470          * dropping priority.
3471          */
3472         if (holder->sched_pri < mutex->lck_mtx_pri) {   /* unlocked check; repeated below under the thread lock */
3473                 int promote_pri = mutex->lck_mtx_pri;
3474
3475                 spl_t s = splsched();   /* the thread lock is taken at splsched; restored by splx() below */
3476                 thread_lock(holder);
3477
3478                 /* Check again in case sched_pri changed */
3479                 if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) {
3480                         if (mutex->lck_mtx_promoted == 0) {
3481                                 /* This is the first promotion for this mutex */
3482                                 mutex->lck_mtx_promoted = 1;
3483
3484                                 if (holder->promotions++ == 0) {
3485                                         /* This is the first promotion for holder */
3486                                         sched_thread_promote_to_pri(holder, promote_pri, trace_lck);
3487                                 } else {
3488                                         /*
3489                                          * Holder was previously promoted due to a different mutex,
3490                                          * check if it needs to raise to match this one
3491                                          */
3492                                         sched_thread_update_promotion_to_pri(holder, promote_pri,
3493                                                                              trace_lck);
3494                                 }
3495                         } else {
3496                                 /*
3497                                  * Holder was previously promoted due to this mutex,
3498                                  * check if the pri needs to go up
3499                                  */
3500                                 sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck);
3501                         }
3502                 }
3503
3504                 thread_unlock(holder);
3505                 splx(s);
3506         }
3507
3508         mutex->lck_mtx_waiters++;       /* count ourselves as a waiter before blocking */
3509
3510         thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3511         assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
3512
3513         lck_mtx_ilk_unlock(mutex);      /* the interlock is dropped only after the wait has been asserted */
3514
3515         thread_block(THREAD_CONTINUE_NULL);     /* block with no continuation */
3516
3517         self->waiting_for_mutex = NULL;
3518
3519         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3520                      trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3521                      mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
3522
3523 #if     CONFIG_DTRACE
3524         /*
3525          * Record the Dtrace lockstat probe for blocking, block time
3526          * measured from when we were entered.
3527          */
3528         if (sleep_start) {
3529                 if (mutex->lck_mtx_is_ext == 0) {
3530                         LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3531                             mach_absolute_time() - sleep_start);
3532                 } else {
3533                         LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3534                             mach_absolute_time() - sleep_start);
3535                 }
3536         }
3537 #endif
3538 }
3539
3540 /*
3541  *      Routine: kdp_lck_mtx_lock_spin_is_acquired
3542  *      NOT SAFE: To be used only by kernel debugger to avoid deadlock;
3543  *      panics if not_in_kdp is set.
3544  *      Returns: TRUE if lck_mtx_ilocked or lck_mtx_mlocked is nonzero,
3545  *      FALSE otherwise.
3546  */
3547 boolean_t
3548 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t     *lck)
3549 {
3550         if (not_in_kdp) {
3551                 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3552         }
3553
3554         /* Either lock field being nonzero counts as "acquired". */
3555         boolean_t held = (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) ? TRUE : FALSE;
3556         return held;
3557 }
3558
3559 void
3560 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3561 {
3562         /* Fill waitinfo with the mutex behind "event" and its owner's thread id. */
3563         lck_mtx_t *waited_mtx = LCK_EVENT_TO_MUTEX(event);
3564         waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(waited_mtx);
3565         waitinfo->owner = thread_tid((thread_t)waited_mtx->lck_mtx_owner);
3566 }
3567
3568 void
3569 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3570 {       /* Fill waitinfo->context and waitinfo->owner for a wait on an rw lock event. */
3571         lck_rw_t *waited_rw = NULL;     /* rw lock recovered from the wait event */
3572         switch (waitinfo->wait_type) {  /* the wait type selects the event-to-lock macro */
3573         case kThreadWaitKernelRWLockWrite:
3574         case kThreadWaitKernelRWLockUpgrade:
3575                 waited_rw = WRITE_EVENT_TO_RWLOCK(event);
3576                 break;
3577         case kThreadWaitKernelRWLockRead:
3578                 waited_rw = READ_EVENT_TO_RWLOCK(event);
3579                 break;
3580         default:
3581                 panic("%s was called with an invalid blocking type", __FUNCTION__);
3582                 break;
3583         }
3584         waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(waited_rw);
3585         waitinfo->owner = 0;            /* this routine reports no owner thread id */
3586 }