apple/xnu.git (tag xnu-6153.61.1): osfmk/i386/locks_i386.c
1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #define LOCK_PRIVATE 1
65
66 #include <mach_ldebug.h>
67
68 #include <kern/lock_stat.h>
69 #include <kern/locks.h>
70 #include <kern/kalloc.h>
71 #include <kern/misc_protos.h>
72 #include <kern/thread.h>
73 #include <kern/processor.h>
74 #include <kern/cpu_data.h>
75 #include <kern/cpu_number.h>
76 #include <kern/sched_prim.h>
77 #include <kern/debug.h>
78 #include <string.h>
79
80 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
81 #include <machine/atomic.h>
82 #include <machine/machine_cpu.h>
83 #include <i386/mp.h>
85 #include <sys/kdebug.h>
86 #include <i386/locks_i386_inlines.h>
87
88 #if CONFIG_DTRACE
89 #define DTRACE_RW_SHARED 0x0 //reader
90 #define DTRACE_RW_EXCL 0x1 //writer
91 #define DTRACE_NO_FLAG 0x0 //not applicable
92 #endif /* CONFIG_DTRACE */
93
94 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
95 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
96 #define LCK_RW_LCK_SHARED_CODE 0x102
97 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
98 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
99 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
100
101 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
102 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
103 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
104 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
105 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
106 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
107 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
108 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
109
110
111 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
112
113 unsigned int LcksOpts=0;
114
115 #if DEVELOPMENT || DEBUG
116 unsigned int LckDisablePreemptCheck = 0;
117 #endif
118
119 /* Forwards */
120
121 #if USLOCK_DEBUG
122 /*
123 * Perform simple lock checks.
124 */
125 int uslock_check = 1;
126 int max_lock_loops = 100000000;
127 decl_simple_lock_data(extern , printf_lock);
128 decl_simple_lock_data(extern , panic_lock);
129 #endif /* USLOCK_DEBUG */
130
131 extern unsigned int not_in_kdp;
132
133 /*
134 * We often want to know the addresses of the callers
135 * of the various lock routines. However, this information
136 * is only used for debugging and statistics.
137 */
138 typedef void *pc_t;
139 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
140 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
141 #if ANY_LOCK_DEBUG
142 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
143 #define DECL_PC(pc) pc_t pc;
144 #else /* ANY_LOCK_DEBUG */
145 #define DECL_PC(pc)
146 #ifdef lint
147 /*
148 * Eliminate lint complaints about unused local pc variables.
149 */
150 #define OBTAIN_PC(pc) ++pc
151 #else /* lint */
152 #define OBTAIN_PC(pc)
153 #endif /* lint */
154 #endif /* ANY_LOCK_DEBUG */
155
156 /*
157 * atomic exchange API is a low level abstraction of the operations
158 * to atomically read, modify, and write a pointer. This abstraction works
159 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
160 * well as the ARM exclusive instructions.
161 *
162 * atomic_exchange_begin() - begin exchange and retrieve current value
163 * atomic_exchange_complete() - conclude an exchange
164 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
165 */
166 static uint32_t
167 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
168 {
169 uint32_t val;
170
171 (void)ord; // Memory order not used
172 val = os_atomic_load(target, relaxed);
173 *previous = val;
174 return val;
175 }
176
177 static boolean_t
178 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
179 {
180 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
181 }
182
183 static void
184 atomic_exchange_abort(void) { }
185
186 static boolean_t
187 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
188 {
189 uint32_t value, prev;
190
191 for ( ; ; ) {
192 value = atomic_exchange_begin32(target, &prev, ord);
193 if (value & test_mask) {
194 if (wait)
195 cpu_pause();
196 else
197 atomic_exchange_abort();
198 return FALSE;
199 }
200 value |= set_mask;
201 if (atomic_exchange_complete32(target, prev, value, ord))
202 return TRUE;
203 }
204 }
205
206 inline boolean_t
207 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
208 {
209 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
210 }
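
/*
 * A minimal usage sketch of the exchange API above (the object and flag
 * names are hypothetical, not part of this file): atomically set a BUSY
 * bit only while a DEAD bit is clear, retrying if the compare-exchange
 * loses a race.
 *
 *	uint32_t data, prev;
 *
 *	for ( ; ; ) {
 *		data = atomic_exchange_begin32(&obj->flags, &prev, memory_order_acquire_smp);
 *		if (data & OBJ_DEAD) {
 *			atomic_exchange_abort();
 *			return FALSE;
 *		}
 *		data |= OBJ_BUSY;
 *		if (atomic_exchange_complete32(&obj->flags, prev, data, memory_order_acquire_smp))
 *			return TRUE;
 *		cpu_pause();
 *	}
 *
 * Equivalently, atomic_test_and_set32(&obj->flags, OBJ_DEAD, OBJ_BUSY,
 * memory_order_acquire_smp, FALSE) expresses the same operation.
 */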
211
212 /*
213 * Portable lock package implementation of usimple_locks.
214 */
215
216 #if USLOCK_DEBUG
217 #define USLDBG(stmt) stmt
218 void usld_lock_init(usimple_lock_t, unsigned short);
219 void usld_lock_pre(usimple_lock_t, pc_t);
220 void usld_lock_post(usimple_lock_t, pc_t);
221 void usld_unlock(usimple_lock_t, pc_t);
222 void usld_lock_try_pre(usimple_lock_t, pc_t);
223 void usld_lock_try_post(usimple_lock_t, pc_t);
224 int usld_lock_common_checks(usimple_lock_t, char *);
225 #else /* USLOCK_DEBUG */
226 #define USLDBG(stmt)
227 #endif /* USLOCK_DEBUG */
228
229 /*
230 * Forward definitions
231 */
232
233 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
234 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
235 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
236 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
237 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
238 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
239 void lck_rw_clear_promotions_x86(thread_t thread);
240 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
241 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
242 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
243 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
244 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
245 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
246 static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
247 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
248 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
249 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
250
251
252 /*
253 * Routine: lck_spin_alloc_init
254 */
255 lck_spin_t *
256 lck_spin_alloc_init(
257 lck_grp_t *grp,
258 lck_attr_t *attr)
259 {
260 lck_spin_t *lck;
261
262 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
263 lck_spin_init(lck, grp, attr);
264
265 return(lck);
266 }
267
268 /*
269 * Routine: lck_spin_free
270 */
271 void
272 lck_spin_free(
273 lck_spin_t *lck,
274 lck_grp_t *grp)
275 {
276 lck_spin_destroy(lck, grp);
277 kfree(lck, sizeof(lck_spin_t));
278 }
279
280 /*
281 * Routine: lck_spin_init
282 */
283 void
284 lck_spin_init(
285 lck_spin_t *lck,
286 lck_grp_t *grp,
287 __unused lck_attr_t *attr)
288 {
289 usimple_lock_init((usimple_lock_t) lck, 0);
290 if (grp) {
291 lck_grp_reference(grp);
292 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
293 }
294 }
295
296 /*
297 * Routine: lck_spin_destroy
298 */
299 void
300 lck_spin_destroy(
301 lck_spin_t *lck,
302 lck_grp_t *grp)
303 {
304 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
305 return;
306 lck->interlock = LCK_SPIN_TAG_DESTROYED;
307 if (grp) {
308 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
309 lck_grp_deallocate(grp);
310 }
311 return;
312 }
313
314 /*
315 * Routine: lck_spin_lock
316 */
317 void
318 lck_spin_lock_grp(
319 lck_spin_t *lck,
320 lck_grp_t *grp)
321 {
322 #pragma unused(grp)
323 usimple_lock((usimple_lock_t) lck, grp);
324 }
325
326 void
327 lck_spin_lock(
328 lck_spin_t *lck)
329 {
330 usimple_lock((usimple_lock_t) lck, NULL);
331 }
332
333 /*
334 * Routine: lck_spin_unlock
335 */
336 void
337 lck_spin_unlock(
338 lck_spin_t *lck)
339 {
340 usimple_unlock((usimple_lock_t) lck);
341 }
342
343 boolean_t
344 lck_spin_try_lock_grp(
345 lck_spin_t *lck,
346 lck_grp_t *grp)
347 {
348 #pragma unused(grp)
349 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
350 #if DEVELOPMENT || DEBUG
351 if (lrval) {
352 pltrace(FALSE);
353 }
354 #endif
355 return(lrval);
356 }
357
358
359 /*
360 * Routine: lck_spin_try_lock
361 */
362 boolean_t
363 lck_spin_try_lock(
364 lck_spin_t *lck)
365 {
366 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
367 #if DEVELOPMENT || DEBUG
368 if (lrval) {
369 pltrace(FALSE);
370 }
371 #endif
372 return(lrval);
373 }
374
375 /*
376 * Routine: lck_spin_assert
377 */
378 void
379 lck_spin_assert(lck_spin_t *lock, unsigned int type)
380 {
381 thread_t thread, holder;
382 uintptr_t state;
383
384 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
385 panic("lck_spin_assert(): invalid arg (%u)", type);
386 }
387
388 state = lock->interlock;
389 holder = (thread_t)state;
390 thread = current_thread();
391 if (type == LCK_ASSERT_OWNED) {
392 if (__improbable(holder == THREAD_NULL)) {
393 panic("Lock not owned %p = %lx", lock, state);
394 }
395 if (__improbable(holder != thread)) {
396 panic("Lock not owned by current thread %p = %lx", lock, state);
397 }
398 } else if (type == LCK_ASSERT_NOTOWNED) {
399 if (__improbable(holder != THREAD_NULL)) {
400 if (holder == thread) {
401 panic("Lock owned by current thread %p = %lx", lock, state);
402 }
403 }
404 }
405 }
406
407 /*
408 * Routine: kdp_lck_spin_is_acquired
409 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
410 * Returns: TRUE if lock is acquired.
411 */
412 boolean_t
413 kdp_lck_spin_is_acquired(lck_spin_t *lck) {
414 if (not_in_kdp) {
415 panic("spinlock acquired check done outside of kernel debugger");
416 }
417 return (lck->interlock != 0)? TRUE : FALSE;
418 }
419
420 /*
421 * Initialize a usimple_lock.
422 *
423 * No change in preemption state.
424 */
425 void
426 usimple_lock_init(
427 usimple_lock_t l,
428 __unused unsigned short tag)
429 {
430 #ifndef MACHINE_SIMPLE_LOCK
431 USLDBG(usld_lock_init(l, tag));
432 hw_lock_init(&l->interlock);
433 #else
434 simple_lock_init((simple_lock_t)l,tag);
435 #endif
436 }
437
438 volatile uint32_t spinlock_owner_cpu = ~0;
439 volatile usimple_lock_t spinlock_timed_out;
440
441 uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
442 uint32_t i;
443
444 for (i = 0; i < real_ncpus; i++) {
445 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
446 spinlock_owner_cpu = i;
447 if ((uint32_t) cpu_number() != i) {
448 /* Cause NMI and panic on the owner's cpu */
449 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
450 }
451 break;
452 }
453 }
454
455 return spinlock_owner_cpu;
456 }
457
458 /*
459 * Acquire a usimple_lock.
460 *
461 * Returns with preemption disabled. Note
462 * that the hw_lock routines are responsible for
463 * maintaining preemption state.
464 */
465 void
466 (usimple_lock)(
467 usimple_lock_t l
468 LCK_GRP_ARG(lck_grp_t *grp))
469 {
470 #ifndef MACHINE_SIMPLE_LOCK
471 DECL_PC(pc);
472
473 OBTAIN_PC(pc);
474 USLDBG(usld_lock_pre(l, pc));
475
476 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
477 boolean_t uslock_acquired = FALSE;
478 while (machine_timeout_suspended()) {
479 enable_preemption();
480 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp)))
481 break;
482 }
483
484 if (uslock_acquired == FALSE) {
485 uint32_t lock_cpu;
486 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
487 spinlock_timed_out = l;
488 lock_cpu = spinlock_timeout_NMI(lowner);
489 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
490 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
491 }
492 }
493 #if DEVELOPMENT || DEBUG
494 pltrace(FALSE);
495 #endif
496
497 USLDBG(usld_lock_post(l, pc));
498 #else
499 simple_lock((simple_lock_t)l, grp);
500 #endif
501 #if CONFIG_DTRACE
502 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
503 #endif
504 }
505
506
507 /*
508 * Release a usimple_lock.
509 *
510 * Returns with preemption enabled. Note
511 * that the hw_lock routines are responsible for
512 * maintaining preemption state.
513 */
514 void
515 usimple_unlock(
516 usimple_lock_t l)
517 {
518 #ifndef MACHINE_SIMPLE_LOCK
519 DECL_PC(pc);
520
521 OBTAIN_PC(pc);
522 USLDBG(usld_unlock(l, pc));
523 #if DEVELOPMENT || DEBUG
524 pltrace(TRUE);
525 #endif
526 hw_lock_unlock(&l->interlock);
527 #else
528 simple_unlock_rwmb((simple_lock_t)l);
529 #endif
530 }
531
532
533 /*
534 * Conditionally acquire a usimple_lock.
535 *
536 * On success, returns with preemption disabled.
537 * On failure, returns with preemption in the same state
538 * as when first invoked. Note that the hw_lock routines
539 * are responsible for maintaining preemption state.
540 *
541 * XXX No stats are gathered on a miss; I preserved this
542 * behavior from the original assembly-language code, but
543 * doesn't it make sense to log misses? XXX
544 */
545 unsigned int
546 usimple_lock_try(
547 usimple_lock_t l,
548 lck_grp_t *grp)
549 {
550 #ifndef MACHINE_SIMPLE_LOCK
551 unsigned int success;
552 DECL_PC(pc);
553
554 OBTAIN_PC(pc);
555 USLDBG(usld_lock_try_pre(l, pc));
556 if ((success = hw_lock_try(&l->interlock, grp))) {
557 #if DEVELOPMENT || DEBUG
558 pltrace(FALSE);
559 #endif
560 USLDBG(usld_lock_try_post(l, pc));
561 }
562 return success;
563 #else
564 return(simple_lock_try((simple_lock_t)l, grp));
565 #endif
566 }
567
568 /*
569 * Acquire a usimple_lock while polling for pending cpu signals
570 * and spinning on a lock.
571 *
572 */
573 unsigned int
574 (usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
575 uint64_t deadline
576 LCK_GRP_ARG(lck_grp_t *grp))
577 {
578 boolean_t istate = ml_get_interrupts_enabled();
579
580 if (deadline < mach_absolute_time()) {
581 return 0;
582 }
583
584 while (!simple_lock_try(l, grp)) {
585 if (!istate)
586 cpu_signal_handler(NULL);
587
588 if (deadline < mach_absolute_time()) {
589 return 0;
590 }
591
592 cpu_pause();
593 }
594
595 return 1;
596 }
597
598 void
599 (usimple_lock_try_lock_loop)(usimple_lock_t l
600 LCK_GRP_ARG(lck_grp_t *grp))
601 {
602 usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
603 }
604
605 unsigned int
606 (usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
607 uint64_t duration
608 LCK_GRP_ARG(lck_grp_t *grp))
609 {
610 uint64_t deadline;
611 uint64_t base_at = mach_absolute_time();
612 uint64_t duration_at;
613
614 nanoseconds_to_absolutetime(duration, &duration_at);
615 deadline = base_at + duration_at;
616 if (deadline < base_at) {
617 /* deadline has overflowed, make it saturate */
618 deadline = ULLONG_MAX;
619 }
620
621 return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
622 }
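
/*
 * A minimal caller sketch for the duration variant above ("my_lock",
 * "my_grp" and the 10ms budget are hypothetical): spin for at most 10ms,
 * servicing pending cross-CPU signals, then fall back if the lock was
 * not obtained.
 *
 *	if (!usimple_lock_try_lock_mp_signal_safe_loop_duration(&my_lock,
 *	        10 * NSEC_PER_MSEC, my_grp)) {
 *		... recovery path: the budget expired without taking the lock ...
 *	}
 *
 * As with the other entry points here, the lock-group argument is only
 * consumed when lock statistics are configured in.
 */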
623
624 #if USLOCK_DEBUG
625 /*
626 * States of a usimple_lock. The default when initializing
627 * a usimple_lock is setting it up for debug checking.
628 */
629 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
630 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
631 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
632 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
633 #define USLOCK_CHECKING(l) (uslock_check && \
634 ((l)->debug.state & USLOCK_CHECKED))
635
636 /*
637 * Initialize the debugging information contained
638 * in a usimple_lock.
639 */
640 void
641 usld_lock_init(
642 usimple_lock_t l,
643 __unused unsigned short tag)
644 {
645 if (l == USIMPLE_LOCK_NULL)
646 panic("lock initialization: null lock pointer");
647 l->lock_type = USLOCK_TAG;
648 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
649 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
650 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
651 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
652 l->debug.duration[0] = l->debug.duration[1] = 0;
653 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
654 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
655 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
656 }
657
658
659 /*
660 * These checks apply to all usimple_locks, not just
661 * those with USLOCK_CHECKED turned on.
662 */
663 int
664 usld_lock_common_checks(
665 usimple_lock_t l,
666 char *caller)
667 {
668 if (l == USIMPLE_LOCK_NULL)
669 panic("%s: null lock pointer", caller);
670 if (l->lock_type != USLOCK_TAG)
671 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
672 if (!(l->debug.state & USLOCK_INIT))
673 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
674 return USLOCK_CHECKING(l);
675 }
676
677
678 /*
679 * Debug checks on a usimple_lock just before attempting
680 * to acquire it.
681 */
682 /* ARGSUSED */
683 void
684 usld_lock_pre(
685 usimple_lock_t l,
686 pc_t pc)
687 {
688 char caller[] = "usimple_lock";
689
690
691 if (!usld_lock_common_checks(l, caller))
692 return;
693
694 /*
695 * Note that we have a weird case where we are getting a lock when we are
696 * in the process of putting the system to sleep. We are running with no
697 * current threads, therefore we can't tell if we are trying to retake a lock
698 * we already hold or one that another processor holds. Therefore we just
699 * ignore this test if the locking thread is 0.
700 */
701
702 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
703 l->debug.lock_thread == (void *) current_thread()) {
704 printf("%s: lock %p already locked (at %p) by",
705 caller, l, l->debug.lock_pc);
706 printf(" current thread %p (new attempt at pc %p)\n",
707 l->debug.lock_thread, pc);
708 panic("%s", caller);
709 }
710 mp_disable_preemption();
711 mp_enable_preemption();
712 }
713
714
715 /*
716 * Debug checks on a usimple_lock just after acquiring it.
717 *
718 * Pre-emption has been disabled at this point,
719 * so we are safe in using cpu_number.
720 */
721 void
722 usld_lock_post(
723 usimple_lock_t l,
724 pc_t pc)
725 {
726 int mycpu;
727 char caller[] = "successful usimple_lock";
728
729
730 if (!usld_lock_common_checks(l, caller))
731 return;
732
733 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
734 panic("%s: lock %p became uninitialized",
735 caller, l);
736 if ((l->debug.state & USLOCK_TAKEN))
737 panic("%s: lock 0x%p became TAKEN by someone else",
738 caller, l);
739
740 mycpu = cpu_number();
741 l->debug.lock_thread = (void *)current_thread();
742 l->debug.state |= USLOCK_TAKEN;
743 l->debug.lock_pc = pc;
744 l->debug.lock_cpu = mycpu;
745 }
746
747
748 /*
749 * Debug checks on a usimple_lock just before
750 * releasing it. Note that the caller has not
751 * yet released the hardware lock.
752 *
753 * Preemption is still disabled, so there's
754 * no problem using cpu_number.
755 */
756 void
757 usld_unlock(
758 usimple_lock_t l,
759 pc_t pc)
760 {
761 int mycpu;
762 char caller[] = "usimple_unlock";
763
764
765 if (!usld_lock_common_checks(l, caller))
766 return;
767
768 mycpu = cpu_number();
769
770 if (!(l->debug.state & USLOCK_TAKEN))
771 panic("%s: lock 0x%p hasn't been taken",
772 caller, l);
773 if (l->debug.lock_thread != (void *) current_thread())
774 panic("%s: unlocking lock 0x%p, owned by thread %p",
775 caller, l, l->debug.lock_thread);
776 if (l->debug.lock_cpu != mycpu) {
777 printf("%s: unlocking lock 0x%p on cpu 0x%x",
778 caller, l, mycpu);
779 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
780 panic("%s", caller);
781 }
782
783 l->debug.unlock_thread = l->debug.lock_thread;
784 l->debug.lock_thread = INVALID_THREAD;
785 l->debug.state &= ~USLOCK_TAKEN;
786 l->debug.unlock_pc = pc;
787 l->debug.unlock_cpu = mycpu;
788 }
789
790
791 /*
792 * Debug checks on a usimple_lock just before
793 * attempting to acquire it.
794 *
795 * Preemption isn't guaranteed to be disabled.
796 */
797 void
798 usld_lock_try_pre(
799 usimple_lock_t l,
800 __unused pc_t pc)
801 {
802 char caller[] = "usimple_lock_try";
803
804 if (!usld_lock_common_checks(l, caller))
805 return;
806 }
807
808
809 /*
810 * Debug checks on a usimple_lock just after
811 * successfully attempting to acquire it.
812 *
813 * Preemption has been disabled by the
814 * lock acquisition attempt, so it's safe
815 * to use cpu_number.
816 */
817 void
818 usld_lock_try_post(
819 usimple_lock_t l,
820 pc_t pc)
821 {
822 int mycpu;
823 char caller[] = "successful usimple_lock_try";
824
825 if (!usld_lock_common_checks(l, caller))
826 return;
827
828 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
829 panic("%s: lock 0x%p became uninitialized",
830 caller, l);
831 if ((l->debug.state & USLOCK_TAKEN))
832 panic("%s: lock 0x%p became TAKEN by someone else",
833 caller, l);
834
835 mycpu = cpu_number();
836 l->debug.lock_thread = (void *) current_thread();
837 l->debug.state |= USLOCK_TAKEN;
838 l->debug.lock_pc = pc;
839 l->debug.lock_cpu = mycpu;
840 }
841 #endif /* USLOCK_DEBUG */
842
843 /*
844 * Routine: lck_rw_alloc_init
845 */
846 lck_rw_t *
847 lck_rw_alloc_init(
848 lck_grp_t *grp,
849 lck_attr_t *attr) {
850 lck_rw_t *lck;
851
852 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
853 bzero(lck, sizeof(lck_rw_t));
854 lck_rw_init(lck, grp, attr);
855 }
856
857 return(lck);
858 }
859
860 /*
861 * Routine: lck_rw_free
862 */
863 void
864 lck_rw_free(
865 lck_rw_t *lck,
866 lck_grp_t *grp) {
867 lck_rw_destroy(lck, grp);
868 kfree(lck, sizeof(lck_rw_t));
869 }
870
871 /*
872 * Routine: lck_rw_init
873 */
874 void
875 lck_rw_init(
876 lck_rw_t *lck,
877 lck_grp_t *grp,
878 lck_attr_t *attr)
879 {
880 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
881 attr : &LockDefaultLckAttr;
882
883 hw_lock_byte_init(&lck->lck_rw_interlock);
884 lck->lck_rw_want_write = FALSE;
885 lck->lck_rw_want_upgrade = FALSE;
886 lck->lck_rw_shared_count = 0;
887 lck->lck_rw_can_sleep = TRUE;
888 lck->lck_r_waiting = lck->lck_w_waiting = 0;
889 lck->lck_rw_tag = 0;
890 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
891 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
892
893 lck_grp_reference(grp);
894 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
895 }
896
897 /*
898 * Routine: lck_rw_destroy
899 */
900 void
901 lck_rw_destroy(
902 lck_rw_t *lck,
903 lck_grp_t *grp)
904 {
905 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
906 return;
907 #if MACH_LDEBUG
908 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
909 #endif
910 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
911 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
912 lck_grp_deallocate(grp);
913 return;
914 }
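
/*
 * A minimal lifecycle sketch for the routines above (the group and lock
 * names are hypothetical): allocate a lock group once for a subsystem,
 * create rw locks against it, and tear both down in the reverse order.
 *
 *	lck_grp_t *my_grp  = lck_grp_alloc_init("my-subsystem", LCK_GRP_ATTR_NULL);
 *	lck_rw_t  *my_lock = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_rw_lock_exclusive(my_lock);
 *	... exclusive critical section ...
 *	lck_rw_unlock_exclusive(my_lock);
 *
 *	lck_rw_free(my_lock, my_grp);		(calls lck_rw_destroy internally)
 *	lck_grp_free(my_grp);
 */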
915
916 /*
917 * Sleep locks. These use the same data structure and algorithm
918 * as the spin locks, but the process sleeps while it is waiting
919 * for the lock. These work on uniprocessor systems.
920 */
921
922 #define DECREMENTER_TIMEOUT 1000000
923
924 /*
925 * We disable interrupts while holding the RW interlock to prevent an
926 * interrupt from exacerbating hold time.
927 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
928 */
929 static inline boolean_t
930 lck_interlock_lock(lck_rw_t *lck)
931 {
932 boolean_t istate;
933
934 istate = ml_set_interrupts_enabled(FALSE);
935 hw_lock_byte_lock(&lck->lck_rw_interlock);
936 return istate;
937 }
938
939 static inline void
940 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
941 {
942 hw_lock_byte_unlock(&lck->lck_rw_interlock);
943 ml_set_interrupts_enabled(istate);
944 }
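
/*
 * The helpers above are used in the paired pattern below throughout the
 * slow paths in this file: the interrupt state returned by
 * lck_interlock_lock() must be handed back to lck_interlock_unlock() so
 * the caller's interrupt enablement is restored exactly as it was found.
 *
 *	boolean_t istate;
 *
 *	istate = lck_interlock_lock(lck);
 *	... examine or update the lck_rw_t state fields ...
 *	lck_interlock_unlock(lck, istate);
 */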
945
946 /*
947 * This inline is used when busy-waiting for an rw lock.
948 * If interrupts were disabled when the lock primitive was called,
949 * we poll the IPI handler for pending tlb flushes.
950 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
951 */
952 static inline void
953 lck_rw_lock_pause(boolean_t interrupts_enabled)
954 {
955 if (!interrupts_enabled)
956 handle_pending_TLB_flushes();
957 cpu_pause();
958 }
959
960 static inline boolean_t
961 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
962 {
963 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE))
964 return TRUE;
965 return FALSE;
966 }
967
968 /*
969 * compute the deadline to spin against when
970 * waiting for a change of state on a lck_rw_t
971 */
972 static inline uint64_t
973 lck_rw_deadline_for_spin(lck_rw_t *lck)
974 {
975 if (lck->lck_rw_can_sleep) {
976 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
977 /*
978 * there are already threads waiting on this lock... this
979 * implies that they have spun beyond their deadlines waiting for
980 * the desired state to show up so we will not bother spinning at this time...
981 * or
982 * the current number of threads sharing this lock exceeds our capacity to run them
983 * concurrently and since all states we're going to spin for require the rw_shared_count
984 * to be at 0, we'll not bother spinning since the latency for this to happen is
985 * unpredictable...
986 */
987 return (mach_absolute_time());
988 }
989 return (mach_absolute_time() + MutexSpin);
990 } else
991 return (mach_absolute_time() + (100000LL * 1000000000LL));
992 }
993
994
995 /*
996 * Spin while interlock is held.
997 */
998
999 static inline void
1000 lck_rw_interlock_spin(lck_rw_t *lock)
1001 {
1002 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1003 cpu_pause();
1004 }
1005 }
1006
1007 static boolean_t
1008 lck_rw_grab_want(lck_rw_t *lock)
1009 {
1010 uint32_t data, prev;
1011
1012 for ( ; ; ) {
1013 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
1014 if ((data & LCK_RW_INTERLOCK) == 0)
1015 break;
1016 atomic_exchange_abort();
1017 lck_rw_interlock_spin(lock);
1018 }
1019 if (data & LCK_RW_WANT_WRITE) {
1020 atomic_exchange_abort();
1021 return FALSE;
1022 }
1023 data |= LCK_RW_WANT_WRITE;
1024 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1025 }
1026
1027 static boolean_t
1028 lck_rw_grab_shared(lck_rw_t *lock)
1029 {
1030 uint32_t data, prev;
1031
1032 for ( ; ; ) {
1033 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1034 if ((data & LCK_RW_INTERLOCK) == 0)
1035 break;
1036 atomic_exchange_abort();
1037 lck_rw_interlock_spin(lock);
1038 }
1039 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1040 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1041 atomic_exchange_abort();
1042 return FALSE;
1043 }
1044 }
1045 data += LCK_RW_SHARED_READER;
1046 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1047 }
1048
1049 /*
1050 * Routine: lck_rw_lock_exclusive
1051 */
1052 static void
1053 lck_rw_lock_exclusive_gen(
1054 lck_rw_t *lck)
1055 {
1056 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1057 uint64_t deadline = 0;
1058 int slept = 0;
1059 int gotlock = 0;
1060 int lockheld = 0;
1061 wait_result_t res = 0;
1062 boolean_t istate = -1;
1063
1064 #if CONFIG_DTRACE
1065 boolean_t dtrace_ls_initialized = FALSE;
1066 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
1067 uint64_t wait_interval = 0;
1068 int readers_at_sleep = 0;
1069 #endif
1070
1071 /*
1072 * Try to acquire the lck_rw_want_write bit.
1073 */
1074 while ( !lck_rw_grab_want(lck)) {
1075
1076 #if CONFIG_DTRACE
1077 if (dtrace_ls_initialized == FALSE) {
1078 dtrace_ls_initialized = TRUE;
1079 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1080 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1081 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1082 if (dtrace_ls_enabled) {
1083 /*
1084 * Either sleeping or spinning is happening,
1085 * start a timing of our delay interval now.
1086 */
1087 readers_at_sleep = lck->lck_rw_shared_count;
1088 wait_interval = mach_absolute_time();
1089 }
1090 }
1091 #endif
1092 if (istate == -1)
1093 istate = ml_get_interrupts_enabled();
1094
1095 deadline = lck_rw_deadline_for_spin(lck);
1096
1097 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1098
1099 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
1100 lck_rw_lock_pause(istate);
1101
1102 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1103
1104 if (gotlock)
1105 break;
1106 /*
1107 * if we get here, the deadline has expired w/o us
1108 * being able to grab the lock exclusively
1109 * check to see if we're allowed to do a thread_block
1110 */
1111 if (lck->lck_rw_can_sleep) {
1112
1113 istate = lck_interlock_lock(lck);
1114
1115 if (lck->lck_rw_want_write) {
1116
1117 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1118
1119 lck->lck_w_waiting = TRUE;
1120
1121 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1122 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1123 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1124 lck_interlock_unlock(lck, istate);
1125
1126 if (res == THREAD_WAITING) {
1127 res = thread_block(THREAD_CONTINUE_NULL);
1128 slept++;
1129 }
1130 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1131 } else {
1132 lck->lck_rw_want_write = TRUE;
1133 lck_interlock_unlock(lck, istate);
1134 break;
1135 }
1136 }
1137 }
1138 /*
1139 * Wait for readers (and upgrades) to finish...
1140 * the test for these conditions must be done simultaneously with
1141 * a check of the interlock not being held since
1142 * the rw_shared_count will drop to 0 first and then want_upgrade
1143 * will be set to 1 in the shared_to_exclusive scenario... those
1144 * adjustments are done behind the interlock and represent an
1145 * atomic change in state and must be considered as such
1146 * however, once we see the read count at 0, the want_upgrade not set
1147 * and the interlock not held, we are safe to proceed
1148 */
1149 while (lck_rw_held_read_or_upgrade(lck)) {
1150
1151 #if CONFIG_DTRACE
1152 /*
1153 * Either sleeping or spinning is happening, start
1154 * a timing of our delay interval now. If we set it
1155 * to -1 we don't have accurate data so we cannot later
1156 * decide to record a dtrace spin or sleep event.
1157 */
1158 if (dtrace_ls_initialized == FALSE) {
1159 dtrace_ls_initialized = TRUE;
1160 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1161 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1162 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1163 if (dtrace_ls_enabled) {
1164 /*
1165 * Either sleeping or spinning is happening,
1166 * start a timing of our delay interval now.
1167 */
1168 readers_at_sleep = lck->lck_rw_shared_count;
1169 wait_interval = mach_absolute_time();
1170 }
1171 }
1172 #endif
1173 if (istate == -1)
1174 istate = ml_get_interrupts_enabled();
1175
1176 deadline = lck_rw_deadline_for_spin(lck);
1177
1178 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1179
1180 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1181 lck_rw_lock_pause(istate);
1182
1183 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1184
1185 if ( !lockheld)
1186 break;
1187 /*
1188 * if we get here, the deadline has expired w/o us
1189 * being able to grab the lock exclusively
1190 * check to see if we're allowed to do a thread_block
1191 */
1192 if (lck->lck_rw_can_sleep) {
1193
1194 istate = lck_interlock_lock(lck);
1195
1196 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1197 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1198
1199 lck->lck_w_waiting = TRUE;
1200
1201 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1202 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1203 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1204 lck_interlock_unlock(lck, istate);
1205
1206 if (res == THREAD_WAITING) {
1207 res = thread_block(THREAD_CONTINUE_NULL);
1208 slept++;
1209 }
1210 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1211 } else {
1212 lck_interlock_unlock(lck, istate);
1213 /*
1214 * must own the lock now, since we checked for
1215 * readers or upgrade owner behind the interlock
1216 * no need for a call to 'lck_rw_held_read_or_upgrade'
1217 */
1218 break;
1219 }
1220 }
1221 }
1222
1223 #if CONFIG_DTRACE
1224 /*
1225 * Decide what latencies we suffered that are Dtrace events.
1226 * If we have set wait_interval, then we either spun or slept.
1227 * At least we get out from under the interlock before we record
1228 * which is the best we can do here to minimize the impact
1229 * of the tracing.
1230 * If we have set wait_interval to -1, then dtrace was not enabled when we
1231 * started sleeping/spinning so we don't record this event.
1232 */
1233 if (dtrace_ls_enabled == TRUE) {
1234 if (slept == 0) {
1235 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1236 mach_absolute_time() - wait_interval, 1);
1237 } else {
1238 /*
1239 * For the blocking case, we also record if when we blocked
1240 * it was held for read or write, and how many readers.
1241 * Notice that above we recorded this before we dropped
1242 * the interlock so the count is accurate.
1243 */
1244 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1245 mach_absolute_time() - wait_interval, 1,
1246 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1247 }
1248 }
1249 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1250 #endif
1251 }
1252
1253 /*
1254 * Routine: lck_rw_done
1255 */
1256
1257 lck_rw_type_t lck_rw_done(lck_rw_t *lock)
1258 {
1259 uint32_t data, prev;
1260
1261 for ( ; ; ) {
1262 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1263 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1264 atomic_exchange_abort();
1265 lck_rw_interlock_spin(lock);
1266 continue;
1267 }
1268 if (data & LCK_RW_SHARED_MASK) {
1269 data -= LCK_RW_SHARED_READER;
1270 if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */
1271 goto check_waiters;
1272 } else { /* if reader count == 0, must be exclusive lock */
1273 if (data & LCK_RW_WANT_UPGRADE) {
1274 data &= ~(LCK_RW_WANT_UPGRADE);
1275 } else {
1276 if (data & LCK_RW_WANT_WRITE)
1277 data &= ~(LCK_RW_WANT_EXCL);
1278 else /* lock is not 'owned', panic */
1279 panic("Releasing non-exclusive RW lock without a reader refcount!");
1280 }
1281 check_waiters:
1282 if (prev & LCK_RW_W_WAITING) {
1283 data &= ~(LCK_RW_W_WAITING);
1284 if ((prev & LCK_RW_PRIV_EXCL) == 0)
1285 data &= ~(LCK_RW_R_WAITING);
1286 } else
1287 data &= ~(LCK_RW_R_WAITING);
1288 }
1289 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1290 break;
1291 cpu_pause();
1292 }
1293 return lck_rw_done_gen(lock, prev);
1294 }
1295
1296 /*
1297 * Routine: lck_rw_done_gen
1298 *
1299 * called from lck_rw_done()
1300 * prior_lock_state is the value in the 1st
1301 * word of the lock at the time of a successful
1302 * atomic compare and exchange with the new value...
1303 * it represents the state of the lock before we
1304 * decremented the rw_shared_count or cleared either
1305 * rw_want_upgrade or rw_want_write and
1306 * the lck_x_waiting bits... since the wrapper
1307 * routine has already changed the state atomically,
1308 * we just need to decide if we should
1309 * wake up anyone and what value to return... we do
1310 * this by examining the state of the lock before
1311 * we changed it
1312 */
1313 static lck_rw_type_t
1314 lck_rw_done_gen(
1315 lck_rw_t *lck,
1316 uint32_t prior_lock_state)
1317 {
1318 lck_rw_t *fake_lck;
1319 lck_rw_type_t lock_type;
1320 thread_t thread;
1321 uint32_t rwlock_count;
1322
1323 thread = current_thread();
1324 rwlock_count = thread->rwlock_count--;
1325 fake_lck = (lck_rw_t *)&prior_lock_state;
1326
1327 if (lck->lck_rw_can_sleep) {
1328 /*
1329 * prior_lock state is a snapshot of the 1st word of the
1330 * lock in question... we'll fake up a pointer to it
1331 * and carefully not access anything beyond what's defined
1332 * in the first word of a lck_rw_t
1333 */
1334
1335 if (fake_lck->lck_rw_shared_count <= 1) {
1336 if (fake_lck->lck_w_waiting) {
1337 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1338 }
1339
1340 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1341 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1342 }
1343 }
1344 #if MACH_LDEBUG
1345 if (rwlock_count == 0) {
1346 panic("rw lock count underflow for thread %p", thread);
1347 }
1348 #endif
1349 /* Check if dropping the lock means that we need to unpromote */
1350
1351 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1352 /* sched_flags checked without lock, but will be rechecked while clearing */
1353 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1354 }
1355 }
1356 if (fake_lck->lck_rw_shared_count) {
1357 lock_type = LCK_RW_TYPE_SHARED;
1358 } else {
1359 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1360 }
1361
1362 #if CONFIG_DTRACE
1363 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1364 #endif
1365
1366 return lock_type;
1367 }
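
/*
 * The fake_lck technique above depends on all of the fields consulted
 * here (lck_rw_shared_count, the waiter bits and lck_rw_priv_excl) living
 * in the first 32-bit word of lck_rw_t, so a saved copy of that word can
 * be viewed through a lck_rw_t pointer (a sketch of the idiom):
 *
 *	uint32_t snapshot = prior_lock_state;
 *	lck_rw_t *fake = (lck_rw_t *)&snapshot;
 *	... fake->lck_rw_shared_count etc. reflect the pre-release state ...
 */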
1368
1369
1370 /*
1371 * Routine: lck_rw_unlock
1372 */
1373 void
1374 lck_rw_unlock(
1375 lck_rw_t *lck,
1376 lck_rw_type_t lck_rw_type)
1377 {
1378 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1379 lck_rw_unlock_shared(lck);
1380 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1381 lck_rw_unlock_exclusive(lck);
1382 else
1383 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1384 }
1385
1386
1387 /*
1388 * Routine: lck_rw_unlock_shared
1389 */
1390 void
1391 lck_rw_unlock_shared(
1392 lck_rw_t *lck)
1393 {
1394 lck_rw_type_t ret;
1395
1396 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1397 ret = lck_rw_done(lck);
1398
1399 if (ret != LCK_RW_TYPE_SHARED)
1400 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1401 }
1402
1403
1404 /*
1405 * Routine: lck_rw_unlock_exclusive
1406 */
1407 void
1408 lck_rw_unlock_exclusive(
1409 lck_rw_t *lck)
1410 {
1411 lck_rw_type_t ret;
1412
1413 ret = lck_rw_done(lck);
1414
1415 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1416 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1417 }
1418
1419
1420 /*
1421 * Routine: lck_rw_lock
1422 */
1423 void
1424 lck_rw_lock(
1425 lck_rw_t *lck,
1426 lck_rw_type_t lck_rw_type)
1427 {
1428 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1429 lck_rw_lock_shared(lck);
1430 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1431 lck_rw_lock_exclusive(lck);
1432 else
1433 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1434 }
1435
1436 /*
1437 * Routine: lck_rw_lock_shared
1438 */
1439 void
1440 lck_rw_lock_shared(lck_rw_t *lock)
1441 {
1442 uint32_t data, prev;
1443
1444 current_thread()->rwlock_count++;
1445 for ( ; ; ) {
1446 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1447 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1448 atomic_exchange_abort();
1449 if (lock->lck_rw_can_sleep) {
1450 lck_rw_lock_shared_gen(lock);
1451 } else {
1452 cpu_pause();
1453 continue;
1454 }
1455 break;
1456 }
1457 data += LCK_RW_SHARED_READER;
1458 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1459 break;
1460 cpu_pause();
1461 }
1462 #if CONFIG_DTRACE
1463 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1464 #endif /* CONFIG_DTRACE */
1465 return;
1466 }
1467
1468 /*
1469 * Routine: lck_rw_lock_shared_gen
1470 * Function:
1471 * assembly fast path code has determined that this lock
1472 * is held exclusively... this is where we spin/block
1473 * until we can acquire the lock in the shared mode
1474 */
1475 static void
1476 lck_rw_lock_shared_gen(
1477 lck_rw_t *lck)
1478 {
1479 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1480 uint64_t deadline = 0;
1481 int gotlock = 0;
1482 int slept = 0;
1483 wait_result_t res = 0;
1484 boolean_t istate = -1;
1485
1486 #if CONFIG_DTRACE
1487 uint64_t wait_interval = 0;
1488 int readers_at_sleep = 0;
1489 boolean_t dtrace_ls_initialized = FALSE;
1490 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1491 #endif
1492
1493 while ( !lck_rw_grab_shared(lck)) {
1494
1495 #if CONFIG_DTRACE
1496 if (dtrace_ls_initialized == FALSE) {
1497 dtrace_ls_initialized = TRUE;
1498 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1499 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1500 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1501 if (dtrace_ls_enabled) {
1502 /*
1503 * Either sleeping or spinning is happening,
1504 * start a timing of our delay interval now.
1505 */
1506 readers_at_sleep = lck->lck_rw_shared_count;
1507 wait_interval = mach_absolute_time();
1508 }
1509 }
1510 #endif
1511 if (istate == -1)
1512 istate = ml_get_interrupts_enabled();
1513
1514 deadline = lck_rw_deadline_for_spin(lck);
1515
1516 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1517 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1518
1519 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1520 lck_rw_lock_pause(istate);
1521
1522 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1523 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1524
1525 if (gotlock)
1526 break;
1527 /*
1528 * if we get here, the deadline has expired w/o us
1529 * being able to grab the lock for read
1530 * check to see if we're allowed to do a thread_block
1531 */
1532 if (lck->lck_rw_can_sleep) {
1533
1534 istate = lck_interlock_lock(lck);
1535
1536 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1537 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1538
1539 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1540 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1541
1542 lck->lck_r_waiting = TRUE;
1543
1544 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1545 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1546 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1547 lck_interlock_unlock(lck, istate);
1548
1549 if (res == THREAD_WAITING) {
1550 res = thread_block(THREAD_CONTINUE_NULL);
1551 slept++;
1552 }
1553 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1554 trace_lck, res, slept, 0, 0);
1555 } else {
1556 lck->lck_rw_shared_count++;
1557 lck_interlock_unlock(lck, istate);
1558 break;
1559 }
1560 }
1561 }
1562
1563 #if CONFIG_DTRACE
1564 if (dtrace_ls_enabled == TRUE) {
1565 if (slept == 0) {
1566 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1567 } else {
1568 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1569 mach_absolute_time() - wait_interval, 0,
1570 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1571 }
1572 }
1573 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1574 #endif
1575 }
1576
1577
1578 /*
1579 * Routine: lck_rw_lock_exclusive
1580 */
1581
1582 void
1583 lck_rw_lock_exclusive(lck_rw_t *lock)
1584 {
1585 current_thread()->rwlock_count++;
1586 if (atomic_test_and_set32(&lock->data,
1587 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1588 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1589 #if CONFIG_DTRACE
1590 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1591 #endif /* CONFIG_DTRACE */
1592 } else
1593 lck_rw_lock_exclusive_gen(lock);
1594 }
1595
1596
1597 /*
1598 * Routine: lck_rw_lock_shared_to_exclusive
1599 *
1600 * False returned upon failure, in this case the shared lock is dropped.
1601 */
1602
1603 boolean_t
1604 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1605 {
1606 uint32_t data, prev;
1607
1608 for ( ; ; ) {
1609 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1610 if (data & LCK_RW_INTERLOCK) {
1611 atomic_exchange_abort();
1612 lck_rw_interlock_spin(lock);
1613 continue;
1614 }
1615 if (data & LCK_RW_WANT_UPGRADE) {
1616 data -= LCK_RW_SHARED_READER;
1617 if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */
1618 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1619 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1620 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1621 } else {
1622 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1623 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1624 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1625 break;
1626 }
1627 cpu_pause();
1628 }
1629 /* we now own the WANT_UPGRADE */
1630 if (data & LCK_RW_SHARED_MASK) /* check to see if all of the readers are drained */
1631 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1632 #if CONFIG_DTRACE
1633 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1634 #endif
1635 return TRUE;
1636 }
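
/*
 * A minimal caller sketch for the upgrade path above ("my_lock" is
 * hypothetical).  Note the documented failure behavior: if the upgrade
 * loses the race, the shared hold has already been dropped, so the
 * caller must reacquire and revalidate.
 *
 *	lck_rw_lock_shared(my_lock);
 *	... read-side work ...
 *	if (!lck_rw_lock_shared_to_exclusive(my_lock)) {
 *		lck_rw_lock_exclusive(my_lock);
 *		... revalidate state that may have changed while unlocked ...
 *	}
 *	... write-side work ...
 *	lck_rw_unlock_exclusive(my_lock);
 */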
1637
1638
1639 /*
1640 * Routine: lck_rw_lock_shared_to_exclusive_failure
1641 * Function:
1642 * assembly fast path code has already dropped our read
1643 * count and determined that someone else owns 'lck_rw_want_upgrade'
1644 * if 'lck_rw_shared_count' == 0, it's also already dropped 'lck_w_waiting'
1645 * all we need to do here is determine if a wakeup is needed
1646 */
1647 static boolean_t
1648 lck_rw_lock_shared_to_exclusive_failure(
1649 lck_rw_t *lck,
1650 uint32_t prior_lock_state)
1651 {
1652 lck_rw_t *fake_lck;
1653 thread_t thread = current_thread();
1654 uint32_t rwlock_count;
1655
1656 /* Check if dropping the lock means that we need to unpromote */
1657 rwlock_count = thread->rwlock_count--;
1658 #if MACH_LDEBUG
1659 if (rwlock_count == 0) {
1660 panic("rw lock count underflow for thread %p", thread);
1661 }
1662 #endif
1663 fake_lck = (lck_rw_t *)&prior_lock_state;
1664
1665 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1666 /*
1667 * Someone else has requested upgrade.
1668 * Since we've released the read lock, wake
1669 * him up if he's blocked waiting
1670 */
1671 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1672 }
1673
1674 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1675 /* sched_flags checked without lock, but will be rechecked while clearing */
1676 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1677 }
1678
1679 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1680 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1681
1682 return (FALSE);
1683 }
1684
1685
1686 /*
1687 * Routine: lck_rw_lock_shared_to_exclusive_success
1688 * Function:
1689 * assembly fast path code has already dropped our read
1690 * count and successfully acquired 'lck_rw_want_upgrade'
1691 * we just need to wait for the rest of the readers to drain
1692 * and then we can return as the exclusive holder of this lock
1693 */
1694 static boolean_t
1695 lck_rw_lock_shared_to_exclusive_success(
1696 lck_rw_t *lck)
1697 {
1698 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1699 uint64_t deadline = 0;
1700 int slept = 0;
1701 int still_shared = 0;
1702 wait_result_t res;
1703 boolean_t istate = -1;
1704
1705 #if CONFIG_DTRACE
1706 uint64_t wait_interval = 0;
1707 int readers_at_sleep = 0;
1708 boolean_t dtrace_ls_initialized = FALSE;
1709 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1710 #endif
1711
1712 while (lck->lck_rw_shared_count != 0) {
1713
1714 #if CONFIG_DTRACE
1715 if (dtrace_ls_initialized == FALSE) {
1716 dtrace_ls_initialized = TRUE;
1717 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1718 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1719 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1720 if (dtrace_ls_enabled) {
1721 /*
1722 * Either sleeping or spinning is happening,
1723 * start a timing of our delay interval now.
1724 */
1725 readers_at_sleep = lck->lck_rw_shared_count;
1726 wait_interval = mach_absolute_time();
1727 }
1728 }
1729 #endif
1730 if (istate == -1)
1731 istate = ml_get_interrupts_enabled();
1732
1733 deadline = lck_rw_deadline_for_spin(lck);
1734
1735 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1736 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1737
1738 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1739 lck_rw_lock_pause(istate);
1740
1741 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1742 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1743
1744 if ( !still_shared)
1745 break;
1746 /*
1747 * if we get here, the deadline has expired w/o
1748 * the rw_shared_count having drained to 0
1749 * check to see if we're allowed to do a thread_block
1750 */
1751 if (lck->lck_rw_can_sleep) {
1752
1753 istate = lck_interlock_lock(lck);
1754
1755 if (lck->lck_rw_shared_count != 0) {
1756 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1757 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1758
1759 lck->lck_w_waiting = TRUE;
1760
1761 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1762 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1763 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1764 lck_interlock_unlock(lck, istate);
1765
1766 if (res == THREAD_WAITING) {
1767 res = thread_block(THREAD_CONTINUE_NULL);
1768 slept++;
1769 }
1770 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1771 trace_lck, res, slept, 0, 0);
1772 } else {
1773 lck_interlock_unlock(lck, istate);
1774 break;
1775 }
1776 }
1777 }
1778 #if CONFIG_DTRACE
1779 /*
1780 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1781 */
1782 if (dtrace_ls_enabled == TRUE) {
1783 if (slept == 0) {
1784 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1785 } else {
1786 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1787 mach_absolute_time() - wait_interval, 1,
1788 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1789 }
1790 }
1791 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1792 #endif
1793 return (TRUE);
1794 }
1795
1796 /*
1797 * Routine: lck_rw_lock_exclusive_to_shared
1798 */
1799
1800 void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1801 {
1802 uint32_t data, prev;
1803
1804 for ( ; ; ) {
1805 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1806 if (data & LCK_RW_INTERLOCK) {
1807 atomic_exchange_abort();
1808 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1809 continue;
1810 }
1811 data += LCK_RW_SHARED_READER;
1812 if (data & LCK_RW_WANT_UPGRADE)
1813 data &= ~(LCK_RW_WANT_UPGRADE);
1814 else
1815 data &= ~(LCK_RW_WANT_EXCL);
1816 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))
1817 data &= ~(LCK_RW_W_WAITING);
1818 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1819 break;
1820 cpu_pause();
1821 }
1822 lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1823 }
1824
1825
1826 /*
1827 * Routine: lck_rw_lock_exclusive_to_shared_gen
1828 * Function:
1829 * assembly fast path has already dropped
1830 * our exclusive state and bumped lck_rw_shared_count
1831 * all we need to do here is determine if anyone
1832 * needs to be awakened.
1833 */
1834 static void
1835 lck_rw_lock_exclusive_to_shared_gen(
1836 lck_rw_t *lck,
1837 uint32_t prior_lock_state)
1838 {
1839 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1840 lck_rw_t *fake_lck;
1841
1842 fake_lck = (lck_rw_t *)&prior_lock_state;
1843
1844 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1845 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1846
1847 /*
1848 * don't wake up anyone waiting to take the lock exclusively
1849 * since we hold a read count... when the read count drops to 0,
1850 * the writers will be woken.
1851 *
1852 * wake up any waiting readers if we don't have any writers waiting,
1853 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1854 */
1855 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1856 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1857
1858 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1859 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1860
1861 #if CONFIG_DTRACE
1862 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1863 #endif
1864 }
1865
1866
1867 /*
1868 * Routine: lck_rw_try_lock
1869 */
1870 boolean_t
1871 lck_rw_try_lock(
1872 lck_rw_t *lck,
1873 lck_rw_type_t lck_rw_type)
1874 {
1875 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1876 return(lck_rw_try_lock_shared(lck));
1877 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1878 return(lck_rw_try_lock_exclusive(lck));
1879 else
1880 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1881 return(FALSE);
1882 }
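/*
 * Illustrative usage sketch (not part of this file): a caller that opportunistically
 * tries for the lock and falls back to the blocking path if the try fails. The
 * function and variable names below are hypothetical.
 */
#if 0
static void
example_read_cache(lck_rw_t *cache_lock)
{
	if (lck_rw_try_lock(cache_lock, LCK_RW_TYPE_SHARED)) {
		/* acquired without any possibility of blocking */
		/* ... read the protected data ... */
		lck_rw_done(cache_lock);
	} else {
		/* lock was busy; take the blocking path instead */
		lck_rw_lock(cache_lock, LCK_RW_TYPE_SHARED);
		/* ... read the protected data ... */
		lck_rw_done(cache_lock);
	}
}
#endif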
1883
1884 /*
1885 * Routine: lck_rw_try_lock_shared
1886 */
1887
1888 boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
1889 {
1890 uint32_t data, prev;
1891
1892 for ( ; ; ) {
1893 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1894 if (data & LCK_RW_INTERLOCK) {
1895 atomic_exchange_abort();
1896 lck_rw_interlock_spin(lock);
1897 continue;
1898 }
1899 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1900 atomic_exchange_abort();
1901 return FALSE; /* lock is busy */
1902 }
1903 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1904 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1905 break;
1906 cpu_pause();
1907 }
1908 current_thread()->rwlock_count++;
1909 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1910 #if CONFIG_DTRACE
1911 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1912 #endif /* CONFIG_DTRACE */
1913 return TRUE;
1914 }
1915
1916
1917 /*
1918 * Routine: lck_rw_try_lock_exclusive
1919 */
1920
1921 boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
1922 {
1923 uint32_t data, prev;
1924
1925 for ( ; ; ) {
1926 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1927 if (data & LCK_RW_INTERLOCK) {
1928 atomic_exchange_abort();
1929 lck_rw_interlock_spin(lock);
1930 continue;
1931 }
1932 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1933 atomic_exchange_abort();
1934 return FALSE; /* can't get it */
1935 }
1936 data |= LCK_RW_WANT_EXCL;
1937 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1938 break;
1939 cpu_pause();
1940 }
1941
1942 current_thread()->rwlock_count++;
1943 #if CONFIG_DTRACE
1944 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1945 #endif /* CONFIG_DTRACE */
1946 return TRUE;
1947 }
1948
1949
1950 void
1951 lck_rw_assert(
1952 lck_rw_t *lck,
1953 unsigned int type)
1954 {
1955 switch (type) {
1956 case LCK_RW_ASSERT_SHARED:
1957 if (lck->lck_rw_shared_count != 0) {
1958 return;
1959 }
1960 break;
1961 case LCK_RW_ASSERT_EXCLUSIVE:
1962 if ((lck->lck_rw_want_write ||
1963 lck->lck_rw_want_upgrade) &&
1964 lck->lck_rw_shared_count == 0) {
1965 return;
1966 }
1967 break;
1968 case LCK_RW_ASSERT_HELD:
1969 if (lck->lck_rw_want_write ||
1970 lck->lck_rw_want_upgrade ||
1971 lck->lck_rw_shared_count != 0) {
1972 return;
1973 }
1974 break;
1975 case LCK_RW_ASSERT_NOTHELD:
1976 if (!(lck->lck_rw_want_write ||
1977 lck->lck_rw_want_upgrade ||
1978 lck->lck_rw_shared_count != 0)) {
1979 return;
1980 }
1981 break;
1982 default:
1983 break;
1984 }
1985
1986 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1987 }
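/*
 * Illustrative usage sketch (not part of this file): a routine whose contract
 * requires the caller to hold the lock exclusively can enforce that precondition
 * with lck_rw_assert. The names below are hypothetical.
 */
#if 0
static void
example_modify_table(lck_rw_t *table_lock)
{
	lck_rw_assert(table_lock, LCK_RW_ASSERT_EXCLUSIVE);
	/* ... safe to mutate state protected by table_lock ... */
}
#endif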
1988
1989 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1990 #if MACH_LDEBUG
1991 __dead2
1992 #endif
1993 void
1994 lck_rw_clear_promotions_x86(thread_t thread)
1995 {
1996 #if MACH_LDEBUG
1997 /* It's fatal to leave a RW lock locked and return to userspace */
1998 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1999 #else
2000 /* Paper over the issue */
2001 thread->rwlock_count = 0;
2002 lck_rw_clear_promotion(thread, 0);
2003 #endif
2004 }
2005
2006 boolean_t
2007 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2008 {
2009 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2010
2011 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2012 lck_rw_unlock_shared(lck);
2013 mutex_pause(2);
2014 lck_rw_lock_shared(lck);
2015 return TRUE;
2016 }
2017
2018 return FALSE;
2019 }
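/*
 * Illustrative usage sketch (not part of this file): a long scan that holds the
 * lock shared can periodically offer to yield to queued writers. When the yield
 * happens the lock is dropped and reacquired, so any cached view of the protected
 * data must be revalidated. Names below are hypothetical.
 */
#if 0
static void
example_scan_list(lck_rw_t *list_lock, int nelem)
{
	int i;

	lck_rw_lock_shared(list_lock);
	for (i = 0; i < nelem; i++) {
		/* ... examine element i ... */
		if (lck_rw_lock_yield_shared(list_lock, FALSE)) {
			/* lock was dropped and reacquired; revalidate cached state */
		}
	}
	lck_rw_done(list_lock);
}
#endif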
2020
2021 /*
2022 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2023 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2024 */
2025 boolean_t
2026 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
2027 if (not_in_kdp) {
2028 panic("panic: rw lock exclusive check done outside of kernel debugger");
2029 }
2030 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2031 }
2032
2033 /*
2034 * Slow path routines for lck_mtx locking and unlocking functions.
2035 *
2036 * These functions were previously implemented in x86 assembly,
2037 * and some optimizations are in place in this C code to obtain compiled code
2038 * as performant and compact as the assembly version.
2039 *
2040 * To avoid inlining these functions on the fast path, every function directly called by
2041 * the fast paths is marked __attribute__((noinline)). They are also all implemented
2042 * in such a way that the fast path can tail call into them. In this way the return address
2043 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2044 *
2045 * Slow path code is structured so that there are no calls to functions that will return
2046 * in the context of the caller function, i.e. all functions called are either tail call functions
2047 * or inline functions. The tail call functions take fewer than six arguments,
2048 * so that they can be passed in registers and do not need to be pushed on the stack.
2049 * This allows the compiler to avoid creating a stack frame for these functions.
2050 *
2051 * __improbable and __probable are used to compile the slow path code in such a way
2052 * that the fast path case runs on a sequence of instructions with as few jumps as possible,
2053 * making this case the most optimized even when falling through the slow path.
2054 */
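/*
 * Illustrative sketch (not part of this file's build): the noinline + tail-call
 * structure described above, written with hypothetical names and guarded out so
 * it is never compiled. It assumes a compiler that performs sibling-call
 * optimization for the tail call.
 */
#if 0
typedef struct { volatile uint32_t state; } example_lock_t;

static inline boolean_t
example_try_fast_path(example_lock_t *lock)
{
	/* single cmpxchg attempt; succeeds only if the lock is free */
	return os_atomic_cmpxchg(&lock->state, 0, 1, acquire);
}

__attribute__((noinline))
static void
example_lock_contended(example_lock_t *lock)
{
	/* slow path: takes fewer than six arguments so they stay in registers */
	while (!example_try_fast_path(lock)) {
		cpu_pause();
	}
}

static inline void
example_lock(example_lock_t *lock)
{
	if (__probable(example_try_fast_path(lock))) {
		return;		/* fast path: straight-line code, no extra jumps */
	}
	/* tail call into the noinline slow path: no return address is pushed here */
	return example_lock_contended(lock);
}
#endif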
2055
2056 /*
2057 * Intel lock invariants:
2058 *
2059 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2060 *
2061 * The lock owner is promoted to the max priority of all its waiters only if it
2062 * was at a lower priority when it acquired the lock or was already the owner when a waiter waited.
2063 * Max priority is capped at MAXPRI_PROMOTE.
2064 *
2065 * The last waiter will not be promoted as it is woken up, but the last
2066 * lock owner may not have been the last thread to have been woken up depending on the
2067 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2068 * flag set.
2069 *
2070 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2071 * priority from dropping priority in the future without having to take thread lock
2072 * on acquire.
2073 */
2074
2075 #ifdef MUTEX_ZONE
2076 extern zone_t lck_mtx_zone;
2077 #endif
2078
2079 /*
2080 * Routine: lck_mtx_alloc_init
2081 */
2082 lck_mtx_t *
2083 lck_mtx_alloc_init(
2084 lck_grp_t *grp,
2085 lck_attr_t *attr)
2086 {
2087 lck_mtx_t *lck;
2088 #ifdef MUTEX_ZONE
2089 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
2090 lck_mtx_init(lck, grp, attr);
2091 #else
2092 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
2093 lck_mtx_init(lck, grp, attr);
2094 #endif
2095 return(lck);
2096 }
2097
2098 /*
2099 * Routine: lck_mtx_free
2100 */
2101 void
2102 lck_mtx_free(
2103 lck_mtx_t *lck,
2104 lck_grp_t *grp)
2105 {
2106 lck_mtx_destroy(lck, grp);
2107 #ifdef MUTEX_ZONE
2108 zfree(lck_mtx_zone, lck);
2109 #else
2110 kfree(lck, sizeof(lck_mtx_t));
2111 #endif
2112 }
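/*
 * Illustrative lifecycle sketch (not part of this file): allocating, using and
 * freeing a mutex via the routines above, together with the standard lock group
 * routines from kern/locks.h. The group name and function below are hypothetical.
 */
#if 0
static void
example_mutex_lifecycle(void)
{
	lck_grp_t *grp = lck_grp_alloc_init("example.group", LCK_GRP_ATTR_NULL);
	lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

	lck_mtx_lock(mtx);
	/* ... critical section ... */
	lck_mtx_unlock(mtx);

	lck_mtx_free(mtx, grp);		/* destroys and frees the mutex */
	lck_grp_free(grp);
}
#endif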
2113
2114 /*
2115 * Routine: lck_mtx_ext_init
2116 */
2117 static void
2118 lck_mtx_ext_init(
2119 lck_mtx_ext_t *lck,
2120 lck_grp_t *grp,
2121 lck_attr_t *attr)
2122 {
2123 bzero((void *)lck, sizeof(lck_mtx_ext_t));
2124
2125 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2126 lck->lck_mtx_deb.type = MUTEX_TAG;
2127 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2128 }
2129
2130 lck->lck_mtx_grp = grp;
2131
2132 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
2133 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2134
2135 lck->lck_mtx.lck_mtx_is_ext = 1;
2136 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2137 }
2138
2139 /*
2140 * Routine: lck_mtx_init
2141 */
2142 void
2143 lck_mtx_init(
2144 lck_mtx_t *lck,
2145 lck_grp_t *grp,
2146 lck_attr_t *attr)
2147 {
2148 lck_mtx_ext_t *lck_ext;
2149 lck_attr_t *lck_attr;
2150
2151 if (attr != LCK_ATTR_NULL)
2152 lck_attr = attr;
2153 else
2154 lck_attr = &LockDefaultLckAttr;
2155
2156 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2157 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2158 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2159 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2160 lck->lck_mtx_ptr = lck_ext;
2161 }
2162 } else {
2163 lck->lck_mtx_owner = 0;
2164 lck->lck_mtx_state = 0;
2165 }
2166 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2167 lck_grp_reference(grp);
2168 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2169 }
2170
2171 /*
2172 * Routine: lck_mtx_init_ext
2173 */
2174 void
2175 lck_mtx_init_ext(
2176 lck_mtx_t *lck,
2177 lck_mtx_ext_t *lck_ext,
2178 lck_grp_t *grp,
2179 lck_attr_t *attr)
2180 {
2181 lck_attr_t *lck_attr;
2182
2183 if (attr != LCK_ATTR_NULL)
2184 lck_attr = attr;
2185 else
2186 lck_attr = &LockDefaultLckAttr;
2187
2188 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2189 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2190 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2191 lck->lck_mtx_ptr = lck_ext;
2192 } else {
2193 lck->lck_mtx_owner = 0;
2194 lck->lck_mtx_state = 0;
2195 }
2196 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2197
2198 lck_grp_reference(grp);
2199 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2200 }
2201
2202 static void
2203 lck_mtx_lock_mark_destroyed(
2204 lck_mtx_t *mutex,
2205 boolean_t indirect)
2206 {
2207 uint32_t state;
2208
2209 if (indirect) {
2210 /* convert to destroyed state */
2211 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2212 return;
2213 }
2214
2215 state = ordered_load_mtx_state(mutex);
2216 lck_mtx_interlock_lock(mutex, &state);
2217
2218 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2219
2220 enable_preemption();
2221 }
2222
2223 /*
2224 * Routine: lck_mtx_destroy
2225 */
2226 void
2227 lck_mtx_destroy(
2228 lck_mtx_t *lck,
2229 lck_grp_t *grp)
2230 {
2231 boolean_t indirect;
2232
2233 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
2234 return;
2235 #if MACH_LDEBUG
2236 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2237 #endif
2238 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2239
2240 lck_mtx_lock_mark_destroyed(lck, indirect);
2241
2242 if (indirect)
2243 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2244 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2245 lck_grp_deallocate(grp);
2246 return;
2247 }
2248
2249
2250 #if DEVELOPMENT | DEBUG
2251 __attribute__((noinline))
2252 void
2253 lck_mtx_owner_check_panic(
2254 lck_mtx_t *lock)
2255 {
2256 thread_t owner = (thread_t)lock->lck_mtx_owner;
2257 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2258 }
2259 #endif
2260
2261 __attribute__((always_inline))
2262 static boolean_t
2263 get_indirect_mutex(
2264 lck_mtx_t **lock,
2265 uint32_t *state)
2266 {
2267 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2268 *state = ordered_load_mtx_state(*lock);
2269 return TRUE;
2270 }
2271
2272 /*
2273 * Routine: lck_mtx_unlock_slow
2274 *
2275 * Unlocks a mutex held by current thread.
2276 *
2277 * It will wake up waiters if necessary.
2278 *
2279 * Interlock can be held.
2280 */
2281 __attribute__((noinline))
2282 void
2283 lck_mtx_unlock_slow(
2284 lck_mtx_t *lock)
2285 {
2286 thread_t thread;
2287 uint32_t state, prev;
2288 boolean_t indirect = FALSE;
2289
2290 state = ordered_load_mtx_state(lock);
2291
2292 /* Is this an indirect mutex? */
2293 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2294 indirect = get_indirect_mutex(&lock, &state);
2295 }
2296
2297 thread = current_thread();
2298
2299 #if DEVELOPMENT | DEBUG
2300 thread_t owner = (thread_t)lock->lck_mtx_owner;
2301 if(__improbable(owner != thread))
2302 lck_mtx_owner_check_panic(lock);
2303 #endif
2304
2305 /* check if it is held as a spinlock */
2306 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0))
2307 goto unlock;
2308
2309 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2310
2311 unlock:
2312 /* preemption disabled, interlock held and mutex not held */
2313
2314 /* clear owner */
2315 ordered_store_mtx_owner(lock, 0);
2316 /* keep original state in prev for later evaluation */
2317 prev = state;
2318
2319 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2320 #if MACH_LDEBUG
2321 if (thread)
2322 thread->mutex_count--;
2323 #endif
2324 return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
2325 }
2326
2327 /* release interlock, promotion and clear spin flag */
2328 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
2329 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
2330
2331 #if MACH_LDEBUG
2332 /* perform lock statistics after drop to prevent delay */
2333 if (thread)
2334 thread->mutex_count--; /* lock statistic */
2335 #endif /* MACH_LDEBUG */
2336
2337 /* re-enable preemption */
2338 lck_mtx_unlock_finish_inline(lock, FALSE);
2339
2340 return;
2341 }
2342
2343 #define LCK_MTX_LCK_WAIT_CODE 0x20
2344 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
2345 #define LCK_MTX_LCK_SPIN_CODE 0x22
2346 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2347 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
2348
2349 /*
2350 * Routine: lck_mtx_unlock_wakeup_tail
2351 *
2352 * Invoked on unlock when there is
2353 * contention, i.e. the assembly routine sees
2354 * that mutex->lck_mtx_waiters != 0
2355 *
2356 * neither the mutex nor the interlock is held
2357 *
2358 * Note that this routine might not be called if there are pending
2359 * waiters which have previously been woken up, and they didn't
2360 * end up boosting the old owner.
2361 *
2362 * assembly routine previously did the following to mutex:
2363 * (after saving the state in prior_lock_state)
2364 * decremented lck_mtx_waiters if nonzero
2365 *
2366 * This function needs to be called as a tail call
2367 * to optimize the compiled code.
2368 */
2369 __attribute__((noinline))
2370 static void
2371 lck_mtx_unlock_wakeup_tail (
2372 lck_mtx_t *mutex,
2373 uint32_t state,
2374 boolean_t indirect)
2375 {
2376 struct turnstile *ts;
2377
2378 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2379 kern_return_t did_wake;
2380
2381 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2382 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2383
2384 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2385
2386 if (mutex->lck_mtx_waiters > 1) {
2387 /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the woken-up thread */
2388 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
2389 } else {
2390 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2391 turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
2392 }
2393 assert(did_wake == KERN_SUCCESS);
2394
2395 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2396 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2397
2398 state -= LCK_MTX_WAITER;
2399 state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
2400 ordered_store_mtx_state_release(mutex, state);
2401
2402 assert(current_thread()->turnstile != NULL);
2403
2404 turnstile_cleanup();
2405
2406 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2407 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2408
2409 lck_mtx_unlock_finish_inline(mutex, indirect);
2410 }
2411
2412 /*
2413 * Routine: lck_mtx_lock_acquire_x86
2414 *
2415 * Invoked on acquiring the mutex when there is
2416 * contention (i.e. the assembly routine sees that
2417 * mutex->lck_mtx_waiters != 0)
2418 *
2419 * mutex is owned... interlock is held... preemption is disabled
2420 */
2421 __attribute__((always_inline))
2422 static void
2423 lck_mtx_lock_acquire_inline(
2424 lck_mtx_t *mutex,
2425 struct turnstile *ts)
2426 {
2427 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2428
2429 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2430 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2431
2432 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
2433 assert(thread->waiting_for_mutex == NULL);
2434
2435 if (mutex->lck_mtx_waiters > 0) {
2436 if (ts == NULL) {
2437 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2438 }
2439
2440 turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
2441 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2442 }
2443
2444 if (ts != NULL) {
2445 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2446 }
2447
2448 assert(current_thread()->turnstile != NULL);
2449
2450 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2451 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2452 }
2453
2454 void
2455 lck_mtx_lock_acquire_x86(
2456 lck_mtx_t *mutex)
2457 {
2458 return lck_mtx_lock_acquire_inline(mutex, NULL);
2459 }
2460
2461 /*
2462 * Tail call helpers for lock functions that perform
2463 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2464 * the caller's compiled code.
2465 */
2466
2467 __attribute__((noinline))
2468 static void
2469 lck_mtx_lock_acquire_tail(
2470 lck_mtx_t *mutex,
2471 boolean_t indirect,
2472 struct turnstile *ts)
2473 {
2474 lck_mtx_lock_acquire_inline(mutex, ts);
2475 lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
2476 }
2477
2478 __attribute__((noinline))
2479 static boolean_t
2480 lck_mtx_try_lock_acquire_tail(
2481 lck_mtx_t *mutex)
2482 {
2483 lck_mtx_lock_acquire_inline(mutex, NULL);
2484 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2485
2486 return TRUE;
2487 }
2488
2489 __attribute__((noinline))
2490 static void
2491 lck_mtx_convert_spin_acquire_tail(
2492 lck_mtx_t *mutex)
2493 {
2494 lck_mtx_lock_acquire_inline(mutex, NULL);
2495 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2496 }
2497
2498 boolean_t
2499 lck_mtx_ilk_unlock(
2500 lck_mtx_t *mutex)
2501 {
2502 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2503 return TRUE;
2504 }
2505
2506 static inline void
2507 lck_mtx_interlock_lock_set_and_clear_flags(
2508 lck_mtx_t *mutex,
2509 uint32_t xor_flags,
2510 uint32_t and_flags,
2511 uint32_t *new_state)
2512 {
2513 uint32_t state, prev;
2514 state = *new_state;
2515
2516 for ( ; ; ) {
2517 /* have to wait for interlock to clear */
2518 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2519 cpu_pause();
2520 state = ordered_load_mtx_state(mutex);
2521 }
2522 prev = state; /* prev contains snapshot for exchange */
2523 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
2524 state &= ~and_flags; /* clear flags */
2525
2526 disable_preemption();
2527 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire))
2528 break;
2529 enable_preemption();
2530 cpu_pause();
2531 state = ordered_load_mtx_state(mutex);
2532 }
2533 *new_state = state;
2534 return;
2535 }
2536
2537 static inline void
2538 lck_mtx_interlock_lock_clear_flags(
2539 lck_mtx_t *mutex,
2540 uint32_t and_flags,
2541 uint32_t *new_state)
2542 {
2543 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2544 }
2545
2546 static inline void
2547 lck_mtx_interlock_lock(
2548 lck_mtx_t *mutex,
2549 uint32_t *new_state)
2550 {
2551 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2552 }
2553
2554 static inline int
2555 lck_mtx_interlock_try_lock_set_flags(
2556 lck_mtx_t *mutex,
2557 uint32_t or_flags,
2558 uint32_t *new_state)
2559 {
2560 uint32_t state, prev;
2561 state = *new_state;
2562
2563 /* the interlock or one of the requested flags is already set: fail immediately */
2564 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2565 return 0;
2566 }
2567 prev = state; /* prev contains snapshot for exchange */
2568 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
2569 disable_preemption();
2570 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
2571 *new_state = state;
2572 return 1;
2573 }
2574
2575 enable_preemption();
2576 return 0;
2577 }
2578
2579 static inline int
2580 lck_mtx_interlock_try_lock(
2581 lck_mtx_t *mutex,
2582 uint32_t *new_state)
2583 {
2584 return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state);
2585 }
2586
2587 static inline int
2588 lck_mtx_interlock_try_lock_disable_interrupts(
2589 lck_mtx_t *mutex,
2590 boolean_t *istate)
2591 {
2592 uint32_t state;
2593
2594 *istate = ml_set_interrupts_enabled(FALSE);
2595 state = ordered_load_mtx_state(mutex);
2596
2597 if (lck_mtx_interlock_try_lock(mutex, &state)) {
2598 return 1;
2599 } else {
2600 ml_set_interrupts_enabled(*istate);
2601 return 0;
2602 }
2603 }
2604
2605 static inline void
2606 lck_mtx_interlock_unlock_enable_interrupts(
2607 lck_mtx_t *mutex,
2608 boolean_t istate)
2609 {
2610 lck_mtx_ilk_unlock(mutex);
2611 ml_set_interrupts_enabled(istate);
2612 }
2613
2614 __attribute__((noinline))
2615 static void
2616 lck_mtx_lock_contended(
2617 lck_mtx_t *lock,
2618 boolean_t indirect,
2619 boolean_t *first_miss)
2620 {
2621 lck_mtx_spinwait_ret_type_t ret;
2622 uint32_t state;
2623 thread_t thread;
2624 struct turnstile *ts = NULL;
2625
2626 try_again:
2627
2628 if (indirect) {
2629 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2630 }
2631
2632 ret = lck_mtx_lock_spinwait_x86(lock);
2633 state = ordered_load_mtx_state(lock);
2634 switch (ret) {
2635 case LCK_MTX_SPINWAIT_NO_SPIN:
2636 /*
2637 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2638 * try to spin.
2639 */
2640 if (indirect) {
2641 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2642 }
2643
2644 /* just fall through to case LCK_MTX_SPINWAIT_SPUN */
2645 case LCK_MTX_SPINWAIT_SPUN:
2646 /*
2647 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2648 * interlock not held
2649 */
2650 lck_mtx_interlock_lock(lock, &state);
2651 assert(state & LCK_MTX_ILOCKED_MSK);
2652
2653 if (state & LCK_MTX_MLOCKED_MSK) {
2654 if (indirect) {
2655 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2656 }
2657 lck_mtx_lock_wait_x86(lock, &ts);
2658 /*
2659 * interlock is not held here.
2660 */
2661 goto try_again;
2662 } else {
2663
2664 /* grab the mutex */
2665 state |= LCK_MTX_MLOCKED_MSK;
2666 ordered_store_mtx_state_release(lock, state);
2667 thread = current_thread();
2668 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2669 #if MACH_LDEBUG
2670 if (thread) {
2671 thread->mutex_count++;
2672 }
2673 #endif /* MACH_LDEBUG */
2674 }
2675
2676 break;
2677 case LCK_MTX_SPINWAIT_ACQUIRED:
2678 /*
2679 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2680 * interlock is held and preemption disabled
2681 * owner is set and mutex marked as locked
2682 * statistics updated too
2683 */
2684 break;
2685 default:
2686 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2687 }
2688
2689 /*
2690 * interlock is already acquired here
2691 */
2692
2693 /* mutex has been acquired */
2694 thread = (thread_t)lock->lck_mtx_owner;
2695 if (state & LCK_MTX_WAITERS_MSK) {
2696 /*
2697 * lck_mtx_lock_acquire_tail will call
2698 * turnstile_complete.
2699 */
2700 return lck_mtx_lock_acquire_tail(lock, indirect, ts);
2701 }
2702
2703 if (ts != NULL) {
2704 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2705 }
2706
2707 assert(current_thread()->turnstile != NULL);
2708
2709 /* release the interlock */
2710 lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
2711 }
2712
2713 /*
2714 * Helper noinline functions for calling
2715 * panic to optimize compiled code.
2716 */
2717
2718 __attribute__((noinline)) __abortlike
2719 static void
2720 lck_mtx_destroyed(
2721 lck_mtx_t *lock)
2722 {
2723 panic("trying to interlock destroyed mutex (%p)", lock);
2724 }
2725
2726 __attribute__((noinline))
2727 static boolean_t
2728 lck_mtx_try_destroyed(
2729 lck_mtx_t *lock)
2730 {
2731 panic("trying to interlock destroyed mutex (%p)", lock);
2732 return FALSE;
2733 }
2734
2735 __attribute__((always_inline))
2736 static boolean_t
2737 lck_mtx_lock_wait_interlock_to_clear(
2738 lck_mtx_t *lock,
2739 uint32_t* new_state)
2740 {
2741 uint32_t state;
2742
2743 for ( ; ; ) {
2744 cpu_pause();
2745 state = ordered_load_mtx_state(lock);
2746 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2747 *new_state = state;
2748 return TRUE;
2749 }
2750 if (state & LCK_MTX_MLOCKED_MSK) {
2751 /* if it is held as mutex, just fail */
2752 return FALSE;
2753 }
2754 }
2755 }
2756
2757 __attribute__((always_inline))
2758 static boolean_t
2759 lck_mtx_try_lock_wait_interlock_to_clear(
2760 lck_mtx_t *lock,
2761 uint32_t* new_state)
2762 {
2763 uint32_t state;
2764
2765 for ( ; ; ) {
2766 cpu_pause();
2767 state = ordered_load_mtx_state(lock);
2768 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2769 /* if it is held as mutex or spin, just fail */
2770 return FALSE;
2771 }
2772 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2773 *new_state = state;
2774 return TRUE;
2775 }
2776 }
2777 }
2778
2779 /*
2780 * Routine: lck_mtx_lock_slow
2781 *
2782 * Locks a mutex for current thread.
2783 * If the lock is contended this function might
2784 * sleep.
2785 *
2786 * Called with interlock not held.
2787 */
2788 __attribute__((noinline))
2789 void
2790 lck_mtx_lock_slow(
2791 lck_mtx_t *lock)
2792 {
2793 boolean_t indirect = FALSE;
2794 uint32_t state;
2795 int first_miss = 0;
2796
2797 state = ordered_load_mtx_state(lock);
2798
2799 /* is the interlock or mutex held */
2800 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2801 /*
2802 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2803 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2804 * set in state (state == lck_mtx_tag)
2805 */
2806
2807
2808 /* is the mutex already held and not indirect */
2809 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2810 /* no, must have been the mutex */
2811 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2812 }
2813
2814 /* check to see if it is marked destroyed */
2815 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2816 lck_mtx_destroyed(lock);
2817 }
2818
2819 /* Is this an indirect mutex? */
2820 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2821 indirect = get_indirect_mutex(&lock, &state);
2822
2823 first_miss = 0;
2824 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2825
2826 if (state & LCK_MTX_SPIN_MSK) {
2827 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2828 assert(state & LCK_MTX_ILOCKED_MSK);
2829 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2830 }
2831 }
2832
2833 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2834 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2835 }
2836 }
2837
2838 /* no - can't be INDIRECT, DESTROYED or locked */
2839 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2840 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2841 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2842 }
2843 }
2844
2845 /* lock and interlock acquired */
2846
2847 thread_t thread = current_thread();
2848 /* record owner of mutex */
2849 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2850
2851 #if MACH_LDEBUG
2852 if (thread) {
2853 thread->mutex_count++; /* lock statistic */
2854 }
2855 #endif
2856 /*
2857 * Check if there are waiters to
2858 * inherit their priority.
2859 */
2860 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2861 return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
2862 }
2863
2864 /* release the interlock */
2865 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2866
2867 return;
2868 }
2869
2870 __attribute__((noinline))
2871 boolean_t
2872 lck_mtx_try_lock_slow(
2873 lck_mtx_t *lock)
2874 {
2875 boolean_t indirect = FALSE;
2876 uint32_t state;
2877 int first_miss = 0;
2878
2879 state = ordered_load_mtx_state(lock);
2880
2881 /* is the interlock or mutex held */
2882 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2883 /*
2884 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2885 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2886 * set in state (state == lck_mtx_tag)
2887 */
2888
2889 /* is the mutex already held and not indirect */
2890 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2891 return FALSE;
2892 }
2893
2894 /* check to see if it is marked destroyed */
2895 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2896 lck_mtx_try_destroyed(lock);
2897 }
2898
2899 /* Is this an indirect mutex? */
2900 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2901 indirect = get_indirect_mutex(&lock, &state);
2902
2903 first_miss = 0;
2904 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2905 }
2906
2907 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
2908 if (indirect)
2909 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2910 return FALSE;
2911 }
2912 }
2913
2914 /* no - can't be INDIRECT, DESTROYED or locked */
2915 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2916 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
2917 if (indirect)
2918 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2919 return FALSE;
2920 }
2921 }
2922
2923 /* lock and interlock acquired */
2924
2925 thread_t thread = current_thread();
2926 /* record owner of mutex */
2927 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2928
2929 #if MACH_LDEBUG
2930 if (thread) {
2931 thread->mutex_count++; /* lock statistic */
2932 }
2933 #endif
2934 /*
2935 * Check if there are waiters to
2936 * inherit their priority.
2937 */
2938 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2939 return lck_mtx_try_lock_acquire_tail(lock);
2940 }
2941
2942 /* release the interlock */
2943 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
2944
2945 return TRUE;
2946
2947 }
2948
2949 __attribute__((noinline))
2950 void
2951 lck_mtx_lock_spin_slow(
2952 lck_mtx_t *lock)
2953 {
2954 boolean_t indirect = FALSE;
2955 uint32_t state;
2956 int first_miss = 0;
2957
2958 state = ordered_load_mtx_state(lock);
2959
2960 /* is the interlock or mutex held */
2961 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2962 /*
2963 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2964 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2965 * set in state (state == lck_mtx_tag)
2966 */
2967
2968
2969 /* is the mutex already held and not indirect */
2970 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2971 /* no, must have been the mutex */
2972 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2973 }
2974
2975 /* check to see if it is marked destroyed */
2976 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2977 lck_mtx_destroyed(lock);
2978 }
2979
2980 /* Is this an indirect mutex? */
2981 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2982 indirect = get_indirect_mutex(&lock, &state);
2983
2984 first_miss = 0;
2985 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2986
2987 if (state & LCK_MTX_SPIN_MSK) {
2988 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2989 assert(state & LCK_MTX_ILOCKED_MSK);
2990 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2991 }
2992 }
2993
2994 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2995 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2996 }
2997 }
2998
2999 /* no - can't be INDIRECT, DESTROYED or locked */
3000 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) {
3001 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3002 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3003 }
3004 }
3005
3006 /* lock as spinlock and interlock acquired */
3007
3008 thread_t thread = current_thread();
3009 /* record owner of mutex */
3010 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3011
3012 #if MACH_LDEBUG
3013 if (thread) {
3014 thread->mutex_count++; /* lock statistic */
3015 }
3016 #endif
3017
3018 #if CONFIG_DTRACE
3019 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3020 #endif
3021 /* return with the interlock held and preemption disabled */
3022 return;
3023 }
3024
3025 __attribute__((noinline))
3026 boolean_t
3027 lck_mtx_try_lock_spin_slow(
3028 lck_mtx_t *lock)
3029 {
3030 boolean_t indirect = FALSE;
3031 uint32_t state;
3032 int first_miss = 0;
3033
3034 state = ordered_load_mtx_state(lock);
3035
3036 /* is the interlock or mutex held */
3037 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3038 /*
3039 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3040 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3041 * set in state (state == lck_mtx_tag)
3042 */
3043
3044 /* is the mutex already held and not indirect */
3045 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
3046 return FALSE;
3047 }
3048
3049 /* check to see if it is marked destroyed */
3050 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3051 lck_mtx_try_destroyed(lock);
3052 }
3053
3054 /* Is this an indirect mutex? */
3055 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3056 indirect = get_indirect_mutex(&lock, &state);
3057
3058 first_miss = 0;
3059 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3060 }
3061
3062 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3063 if (indirect)
3064 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3065 return FALSE;
3066 }
3067 }
3068
3069 /* no - can't be INDIRECT, DESTROYED or locked */
3070 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3071 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3072 if (indirect)
3073 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3074 return FALSE;
3075 }
3076 }
3077
3078 /* lock and interlock acquired */
3079
3080 thread_t thread = current_thread();
3081 /* record owner of mutex */
3082 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3083
3084 #if MACH_LDEBUG
3085 if (thread) {
3086 thread->mutex_count++; /* lock statistic */
3087 }
3088 #endif
3089
3090 #if CONFIG_DTRACE
3091 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3092 #endif
3093 return TRUE;
3094
3095 }
3096
3097 __attribute__((noinline))
3098 void
3099 lck_mtx_convert_spin(
3100 lck_mtx_t *lock)
3101 {
3102 uint32_t state;
3103
3104 state = ordered_load_mtx_state(lock);
3105
3106 /* Is this an indirect mutex? */
3107 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3108 /* If so, take indirection */
3109 get_indirect_mutex(&lock, &state);
3110 }
3111
3112 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3113
3114 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3115 /* already owned as a mutex, just return */
3116 return;
3117 }
3118
3119 assert(get_preemption_level() > 0);
3120 assert(state & LCK_MTX_ILOCKED_MSK);
3121 assert(state & LCK_MTX_SPIN_MSK);
3122
3123 /*
3124 * Check if there are waiters to
3125 * inherit their priority.
3126 */
3127 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3128 return lck_mtx_convert_spin_acquire_tail(lock);
3129 }
3130
3131 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3132
3133 return;
3134 }
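/*
 * Illustrative usage sketch (not part of this file): take the mutex in spin mode
 * for a short, non-blocking peek, then convert to a full mutex hold if longer,
 * possibly blocking work turns out to be needed. Names below are hypothetical.
 */
#if 0
static void
example_spin_then_block(lck_mtx_t *mtx, boolean_t need_long_work)
{
	lck_mtx_lock_spin(mtx);			/* held as spinlock: preemption disabled */
	if (need_long_work) {
		lck_mtx_convert_spin(mtx);	/* now held as a normal mutex; blocking is allowed */
		/* ... potentially blocking work ... */
	}
	lck_mtx_unlock(mtx);			/* handles both the spin and mutex cases */
}
#endif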
3135
3136 static inline boolean_t
3137 lck_mtx_lock_grab_mutex(
3138 lck_mtx_t *lock)
3139 {
3140 uint32_t state;
3141
3142 state = ordered_load_mtx_state(lock);
3143
3144 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3145 return FALSE;
3146 }
3147
3148 /* lock and interlock acquired */
3149
3150 thread_t thread = current_thread();
3151 /* record owner of mutex */
3152 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3153
3154 #if MACH_LDEBUG
3155 if (thread) {
3156 thread->mutex_count++; /* lock statistic */
3157 }
3158 #endif
3159 return TRUE;
3160 }
3161
3162 __attribute__((noinline))
3163 void
3164 lck_mtx_assert(
3165 lck_mtx_t *lock,
3166 unsigned int type)
3167 {
3168 thread_t thread, owner;
3169 uint32_t state;
3170
3171 thread = current_thread();
3172 state = ordered_load_mtx_state(lock);
3173
3174 if (state == LCK_MTX_TAG_INDIRECT) {
3175 get_indirect_mutex(&lock, &state);
3176 }
3177
3178 owner = (thread_t)lock->lck_mtx_owner;
3179
3180 if (type == LCK_MTX_ASSERT_OWNED) {
3181 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))
3182 panic("mutex (%p) not owned\n", lock);
3183 } else {
3184 assert (type == LCK_MTX_ASSERT_NOTOWNED);
3185 if (owner == thread)
3186 panic("mutex (%p) owned\n", lock);
3187 }
3188 }
3189
3190 /*
3191 * Routine: lck_mtx_lock_spinwait_x86
3192 *
3193 * Invoked trying to acquire a mutex when there is contention but
3194 * the holder is running on another processor. We spin for up to a maximum
3195 * time waiting for the lock to be released.
3196 *
3197 * Called with the interlock unlocked.
3198 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3199 * returns LCK_MTX_SPINWAIT_SPUN if we spun
3200 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
3201 */
3202 __attribute__((noinline))
3203 lck_mtx_spinwait_ret_type_t
3204 lck_mtx_lock_spinwait_x86(
3205 lck_mtx_t *mutex)
3206 {
3207 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3208 thread_t holder;
3209 uint64_t overall_deadline;
3210 uint64_t check_owner_deadline;
3211 uint64_t cur_time;
3212 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN;
3213 int loopcount = 0;
3214
3215 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3216 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3217
3218 cur_time = mach_absolute_time();
3219 overall_deadline = cur_time + MutexSpin;
3220 check_owner_deadline = cur_time;
3221
3222 /*
3223 * Spin while:
3224 * - mutex is locked, and
3225 * - it's locked as a spin lock, and
3226 * - owner is running on another processor, and
3227 * - owner (processor) is not idling, and
3228 * - we haven't spun for long enough.
3229 */
3230 do {
3231 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3232 retval = LCK_MTX_SPINWAIT_ACQUIRED;
3233 break;
3234 }
3235 cur_time = mach_absolute_time();
3236
3237 if (cur_time >= overall_deadline)
3238 break;
3239
3240 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
3241 boolean_t istate;
3242
3243 /*
3244 * We will repeatedly peek at the state of the lock while spinning,
3245 * and we will acquire the interlock to do so.
3246 * The thread that will unlock the mutex will also need to acquire
3247 * the interlock, and we want to avoid slowing it down.
3248 * To avoid taking an interrupt while holding the interlock,
3249 * which would increase the time we hold it, we
3250 * try to acquire the interlock with interrupts disabled.
3251 * This is safe because it is a "try_lock": if we can't acquire
3252 * the interlock we re-enable interrupts and fail, so it is
3253 * OK to call it even if the interlock was already held.
3254 */
3255 if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
3256
3257 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
3258
3259 if ( !(holder->machine.specFlags & OnProc) ||
3260 (holder->state & TH_IDLE)) {
3261
3262 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3263
3264 if (loopcount == 0)
3265 retval = LCK_MTX_SPINWAIT_NO_SPIN;
3266 break;
3267 }
3268 }
3269 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3270
3271 check_owner_deadline = cur_time + (MutexSpin / 4);
3272 }
3273 }
3274 cpu_pause();
3275
3276 loopcount++;
3277
3278 } while (TRUE);
3279
3280 #if CONFIG_DTRACE
3281 /*
3282 * overall_deadline was set to (spin start time + MutexSpin), so if dtrace
3283 * is active we can compute backwards from it to determine how
3284 * long we actually spun.
3285 *
3286 * Note that we record a different probe id depending on whether
3287 * this is a direct or indirect mutex. This allows us to
3288 * penalize only lock groups that have debug/stats enabled
3289 * with dtrace processing if desired.
3290 */
3291 if (__probable(mutex->lck_mtx_is_ext == 0)) {
3292 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3293 mach_absolute_time() - (overall_deadline - MutexSpin));
3294 } else {
3295 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3296 mach_absolute_time() - (overall_deadline - MutexSpin));
3297 }
3298 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3299 #endif
3300
3301 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3302 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3303
3304 return retval;
3305 }
3306
3307
3308
3309 /*
3310 * Routine: lck_mtx_lock_wait_x86
3311 *
3312 * Invoked in order to wait on contention.
3313 *
3314 * Called with the interlock locked and
3315 * preemption disabled...
3316 * returns it unlocked and with preemption enabled
3317 *
3318 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3319 * A runnable waiter can exist between wait and acquire
3320 * without a waiters count being set.
3321 * This allows us to never make a spurious wakeup call.
3322 *
3323 * Priority:
3324 * This avoids taking the thread lock if the owning thread is the same priority.
3325 * This optimizes the case of same-priority threads contending on a lock.
3326 * However, that allows the owning thread to drop in priority while holding the lock,
3327 * because there is no state that the priority-change path can examine
3328 * which says that the targeted thread holds a contended mutex.
3329 *
3330 * One possible solution: priority changes could look for some atomic tag
3331 * on the thread saying 'holding contended lock', and then set up a promotion.
3332 * Needs a story for dropping that promotion - the last contended unlock
3333 * has to notice that this has happened.
3334 */
3335 __attribute__((noinline))
3336 void
3337 lck_mtx_lock_wait_x86 (
3338 lck_mtx_t *mutex,
3339 struct turnstile **ts)
3340 {
3341 thread_t self = current_thread();
3342
3343 #if CONFIG_DTRACE
3344 uint64_t sleep_start = 0;
3345
3346 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3347 sleep_start = mach_absolute_time();
3348 }
3349 #endif
3350 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3351
3352 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3353 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3354 mutex->lck_mtx_waiters, 0, 0);
3355
3356 assert(self->waiting_for_mutex == NULL);
3357 self->waiting_for_mutex = mutex;
3358 mutex->lck_mtx_waiters++;
3359
3360 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3361 assert(holder != NULL);
3362
3363 /*
3364 * lck_mtx_lock_wait_x86 might be called in a loop. Call turnstile_prepare just once and
3365 * reuse the same turnstile while looping; the matching turnstile_complete will be called
3366 * by lck_mtx_lock_contended when finally acquiring the lock.
3367 */
3368 if (*ts == NULL) {
3369 *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
3370 }
3371
3372 struct turnstile *turnstile = *ts;
3373 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3374 turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
3375
3376 waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
3377
3378 lck_mtx_ilk_unlock(mutex);
3379
3380 turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
3381
3382 thread_block(THREAD_CONTINUE_NULL);
3383
3384 self->waiting_for_mutex = NULL;
3385
3386 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3387 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3388 mutex->lck_mtx_waiters, 0, 0);
3389
3390 #if CONFIG_DTRACE
3391 /*
3392 * Record the Dtrace lockstat probe for blocking, block time
3393 * measured from when we were entered.
3394 */
3395 if (sleep_start) {
3396 if (mutex->lck_mtx_is_ext == 0) {
3397 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3398 mach_absolute_time() - sleep_start);
3399 } else {
3400 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3401 mach_absolute_time() - sleep_start);
3402 }
3403 }
3404 #endif
3405 }
3406
3407 /*
3408 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3409 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3410 * Returns: TRUE if lock is acquired.
3411 */
3412 boolean_t
3413 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3414 {
3415 if (not_in_kdp) {
3416 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3417 }
3418
3419 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3420 return TRUE;
3421 }
3422
3423 return FALSE;
3424 }
3425
3426 void
3427 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3428 {
3429 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3430 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3431 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3432 waitinfo->owner = thread_tid(holder);
3433 }
3434
3435 void
3436 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3437 {
3438 lck_rw_t *rwlck = NULL;
3439 switch(waitinfo->wait_type) {
3440 case kThreadWaitKernelRWLockRead:
3441 rwlck = READ_EVENT_TO_RWLOCK(event);
3442 break;
3443 case kThreadWaitKernelRWLockWrite:
3444 case kThreadWaitKernelRWLockUpgrade:
3445 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3446 break;
3447 default:
3448 panic("%s was called with an invalid blocking type", __FUNCTION__);
3449 break;
3450 }
3451 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3452 waitinfo->owner = 0;
3453 }