osfmk/i386/locks_i386.c

   1 /*
   2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  *      File:   kern/lock.c
  58  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  59  *      Date:   1985
  60  *
  61  *      Locking primitives implementation
  62  */
  63
/*
 * Defined ahead of the #include lines that follow; presumably consumed by
 * the lock headers to expose their private definitions (TODO confirm).
 */
#define LOCK_PRIVATE 1
  65
  66 #include <mach_ldebug.h>
  67
  68 #include <kern/lock_stat.h>
  69 #include <kern/locks.h>
  70 #include <kern/zalloc.h>
  71 #include <kern/misc_protos.h>
  72 #include <kern/thread.h>
  73 #include <kern/processor.h>
  74 #include <kern/cpu_data.h>
  75 #include <kern/cpu_number.h>
  76 #include <kern/sched_prim.h>
  77 #include <kern/debug.h>
  78 #include <string.h>
  79
  80 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
  81 #include <machine/atomic.h>
  82 #include <machine/machine_cpu.h>
  83 #include <i386/mp.h>
  84 #include <machine/atomic.h>
  85 #include <sys/kdebug.h>
  86 #include <i386/locks_i386_inlines.h>
  87 #include <kern/cpu_number.h>
  88 #include <os/hash.h>
  89
/* Flag values describing the rw-lock mode; only defined for DTrace builds. */
#if     CONFIG_DTRACE
#define DTRACE_RW_SHARED        0x0     //reader
#define DTRACE_RW_EXCL          0x1     //writer
#define DTRACE_NO_FLAG          0x0     //not applicable
#endif /* CONFIG_DTRACE */

/*
 * Numeric codes for the read/write lock paths.  Their consumers are not
 * visible in this part of the file (presumably kdebug trace sub-codes --
 * TODO confirm).  Note the values are hexadecimal and jump from 0x109 to
 * 0x110, so the range is not contiguous.
 */
#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113


/* True when any of the lock debugging configurations is enabled. */
#define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
 114
 115 /* Forwards */
 116
#if     USLOCK_DEBUG
/*
 *      Perform simple lock checks.
 *
 *      uslock_check gates USLOCK_CHECKING() and selects the initial debug
 *      state written by usld_lock_init().  max_lock_loops is not referenced
 *      in the visible part of this file.
 */
int     uslock_check = 1;
int     max_lock_loops  = 100000000;
decl_simple_lock_data(extern, printf_lock);
decl_simple_lock_data(extern, panic_lock);
#endif  /* USLOCK_DEBUG */

/*
 * Defined elsewhere.  kdp_lck_spin_is_acquired() panics when this is
 * non-zero, i.e. when it is called outside the kernel debugger.
 */
extern unsigned int not_in_kdp;
 128
/*
 * Without LOCK_STATS the lock-group argument is dropped: these macros
 * rewrite both the prototypes/definitions below and every call site to the
 * single-argument form.
 */
#if !LOCK_STATS
#define usimple_lock_nopreempt(lck, grp) \
        usimple_lock_nopreempt(lck)
#define usimple_lock_try_nopreempt(lck, grp) \
        usimple_lock_try_nopreempt(lck)
#endif
/* File-local variants that do not adjust the preemption level themselves. */
static void usimple_lock_nopreempt(usimple_lock_t, lck_grp_t *);
static unsigned int usimple_lock_try_nopreempt(usimple_lock_t, lck_grp_t *);
 137
/*
 *      We often want to know the addresses of the callers
 *      of the various lock routines.  However, this information
 *      is only used for debugging and statistics.
 *
 *      DECL_PC() declares a local to hold the caller's pc and
 *      OBTAIN_PC() fills it in; both expand to nothing unless a
 *      lock debugging configuration (or lint) is active.
 */
typedef void    *pc_t;
/* Sentinels stored in the debug fields when no pc / thread is recorded. */
#define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)
#if     ANY_LOCK_DEBUG
#define OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
#define DECL_PC(pc)     pc_t pc;
#else   /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef  lint
/*
 *      Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc)   ++pc
#else   /* lint */
#define OBTAIN_PC(pc)
#endif  /* lint */
#endif  /* ANY_LOCK_DEBUG */
 160
/*
 * Zone views, one per lock type, each sized for the corresponding lock
 * structure.  ZV_LCK_SPIN backs lck_spin_alloc_init()/lck_spin_free();
 * the users of the other views are not in the visible part of this file.
 */
ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin",
    KHEAP_ID_DEFAULT, sizeof(lck_spin_t));

ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx",
    KHEAP_ID_DEFAULT, sizeof(lck_mtx_t));

ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext",
    KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t));

ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw",
    KHEAP_ID_DEFAULT, sizeof(lck_rw_t));
 172
 173 /*
 174  * atomic exchange API is a low level abstraction of the operations
 175  * to atomically read, modify, and write a pointer.  This abstraction works
 176  * for both Intel and ARMv8.1 compare and exchange atomic instructions as
 177  * well as the ARM exclusive instructions.
 178  *
 179  * atomic_exchange_begin() - begin exchange and retrieve current value
 180  * atomic_exchange_complete() - conclude an exchange
 181  * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
 182  */
 183 static uint32_t
 184 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
 185 {
 186         uint32_t        val;
 187
 188         (void)ord;                      // Memory order not used
 189         val = os_atomic_load(target, relaxed);
 190         *previous = val;
 191         return val;
 192 }
 193
 194 static boolean_t
 195 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
 196 {
 197         return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
 198 }
 199
 200 static void
 201 atomic_exchange_abort(void)
 202 {
 203 }
 204
 205 static boolean_t
 206 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
 207 {
 208         uint32_t        value, prev;
 209
 210         for (;;) {
 211                 value = atomic_exchange_begin32(target, &prev, ord);
 212                 if (value & test_mask) {
 213                         if (wait) {
 214                                 cpu_pause();
 215                         } else {
 216                                 atomic_exchange_abort();
 217                         }
 218                         return FALSE;
 219                 }
 220                 value |= set_mask;
 221                 if (atomic_exchange_complete32(target, prev, value, ord)) {
 222                         return TRUE;
 223                 }
 224         }
 225 }
 226
 227 inline boolean_t
 228 hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
 229 {
 230         return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
 231 }
 232
/*
 *      Portable lock package implementation of usimple_locks.
 *
 *      USLDBG(stmt) expands to 'stmt' when USLOCK_DEBUG is set and to
 *      nothing otherwise, so the usld_* debug hooks declared here vanish
 *      from non-debug builds.
 */

#if     USLOCK_DEBUG
#define USLDBG(stmt)    stmt
void            usld_lock_init(usimple_lock_t, unsigned short);
void            usld_lock_pre(usimple_lock_t, pc_t);
void            usld_lock_post(usimple_lock_t, pc_t);
void            usld_unlock(usimple_lock_t, pc_t);
void            usld_lock_try_pre(usimple_lock_t, pc_t);
void            usld_lock_try_post(usimple_lock_t, pc_t);
int             usld_lock_common_checks(usimple_lock_t, char *);
#else   /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif  /* USLOCK_DEBUG */
 249
/*
 * Forward definitions
 *
 * Prototypes for the read/write lock and mutex helpers; their definitions
 * are not in the visible part of this file.
 */

/* Read/write lock helpers. */
static void lck_rw_lock_shared_gen(lck_rw_t *lck);
static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
void lck_rw_clear_promotions_x86(thread_t thread);
static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
static boolean_t lck_rw_grab_want(lck_rw_t *lock);
static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
/* Mutex helpers. */
static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
 270
 271
 272 /*
 273  *      Routine:        lck_spin_alloc_init
 274  */
 275 lck_spin_t *
 276 lck_spin_alloc_init(
 277         lck_grp_t       *grp,
 278         lck_attr_t      *attr)
 279 {
 280         lck_spin_t *lck;
 281
 282         lck = zalloc(ZV_LCK_SPIN);
 283         lck_spin_init(lck, grp, attr);
 284         return lck;
 285 }
 286
 287 /*
 288  *      Routine:        lck_spin_free
 289  */
 290 void
 291 lck_spin_free(
 292         lck_spin_t      *lck,
 293         lck_grp_t       *grp)
 294 {
 295         lck_spin_destroy(lck, grp);
 296         zfree(ZV_LCK_SPIN, lck);
 297 }
 298
 299 /*
 300  *      Routine:        lck_spin_init
 301  */
 302 void
 303 lck_spin_init(
 304         lck_spin_t      *lck,
 305         lck_grp_t       *grp,
 306         __unused lck_attr_t     *attr)
 307 {
 308         usimple_lock_init((usimple_lock_t) lck, 0);
 309         if (grp) {
 310                 lck_grp_reference(grp);
 311                 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
 312         }
 313 }
 314
 315 /*
 316  *      Routine:        lck_spin_destroy
 317  */
 318 void
 319 lck_spin_destroy(
 320         lck_spin_t      *lck,
 321         lck_grp_t       *grp)
 322 {
 323         if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
 324                 return;
 325         }
 326         lck->interlock = LCK_SPIN_TAG_DESTROYED;
 327         if (grp) {
 328                 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
 329                 lck_grp_deallocate(grp);
 330         }
 331         return;
 332 }
 333
 334 /*
 335  *      Routine:        lck_spin_lock
 336  */
 337 void
 338 lck_spin_lock_grp(
 339         lck_spin_t      *lck,
 340         lck_grp_t       *grp)
 341 {
 342 #pragma unused(grp)
 343         usimple_lock((usimple_lock_t) lck, grp);
 344 }
 345
 346 void
 347 lck_spin_lock(
 348         lck_spin_t      *lck)
 349 {
 350         usimple_lock((usimple_lock_t) lck, NULL);
 351 }
 352
 353 void
 354 lck_spin_lock_nopreempt(
 355         lck_spin_t      *lck)
 356 {
 357         usimple_lock_nopreempt((usimple_lock_t) lck, NULL);
 358 }
 359
 360 void
 361 lck_spin_lock_nopreempt_grp(
 362         lck_spin_t      *lck,
 363         lck_grp_t       *grp)
 364 {
 365 #pragma unused(grp)
 366         usimple_lock_nopreempt((usimple_lock_t) lck, grp);
 367 }
 368
 369 /*
 370  *      Routine:        lck_spin_unlock
 371  */
 372 void
 373 lck_spin_unlock(
 374         lck_spin_t      *lck)
 375 {
 376         usimple_unlock((usimple_lock_t) lck);
 377 }
 378
 379 void
 380 lck_spin_unlock_nopreempt(
 381         lck_spin_t      *lck)
 382 {
 383         usimple_unlock_nopreempt((usimple_lock_t) lck);
 384 }
 385
 386 boolean_t
 387 lck_spin_try_lock_grp(
 388         lck_spin_t      *lck,
 389         lck_grp_t       *grp)
 390 {
 391 #pragma unused(grp)
 392         boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
 393 #if     DEVELOPMENT || DEBUG
 394         if (lrval) {
 395                 pltrace(FALSE);
 396         }
 397 #endif
 398         return lrval;
 399 }
 400
 401
 402 /*
 403  *      Routine:        lck_spin_try_lock
 404  */
 405 boolean_t
 406 lck_spin_try_lock(
 407         lck_spin_t      *lck)
 408 {
 409         boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
 410 #if     DEVELOPMENT || DEBUG
 411         if (lrval) {
 412                 pltrace(FALSE);
 413         }
 414 #endif
 415         return lrval;
 416 }
 417
 418 int
 419 lck_spin_try_lock_nopreempt(
 420         lck_spin_t      *lck)
 421 {
 422         boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, LCK_GRP_NULL);
 423 #if     DEVELOPMENT || DEBUG
 424         if (lrval) {
 425                 pltrace(FALSE);
 426         }
 427 #endif
 428         return lrval;
 429 }
 430
 431 int
 432 lck_spin_try_lock_nopreempt_grp(
 433         lck_spin_t      *lck,
 434         lck_grp_t       *grp)
 435 {
 436 #pragma unused(grp)
 437         boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, grp);
 438 #if     DEVELOPMENT || DEBUG
 439         if (lrval) {
 440                 pltrace(FALSE);
 441         }
 442 #endif
 443         return lrval;
 444 }
 445
 446 /*
 447  *      Routine:        lck_spin_assert
 448  */
 449 void
 450 lck_spin_assert(lck_spin_t *lock, unsigned int type)
 451 {
 452         thread_t thread, holder;
 453         uintptr_t state;
 454
 455         if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
 456                 panic("lck_spin_assert(): invalid arg (%u)", type);
 457         }
 458
 459         state = lock->interlock;
 460         holder = (thread_t)state;
 461         thread = current_thread();
 462         if (type == LCK_ASSERT_OWNED) {
 463                 if (__improbable(holder == THREAD_NULL)) {
 464                         panic("Lock not owned %p = %lx", lock, state);
 465                 }
 466                 if (__improbable(holder != thread)) {
 467                         panic("Lock not owned by current thread %p = %lx", lock, state);
 468                 }
 469         } else if (type == LCK_ASSERT_NOTOWNED) {
 470                 if (__improbable(holder != THREAD_NULL)) {
 471                         if (holder == thread) {
 472                                 panic("Lock owned by current thread %p = %lx", lock, state);
 473                         }
 474                 }
 475         }
 476 }
 477
 478 /*
 479  *      Routine: kdp_lck_spin_is_acquired
 480  *      NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 481  *      Returns: TRUE if lock is acquired.
 482  */
 483 boolean_t
 484 kdp_lck_spin_is_acquired(lck_spin_t *lck)
 485 {
 486         if (not_in_kdp) {
 487                 panic("panic: spinlock acquired check done outside of kernel debugger");
 488         }
 489         return (lck->interlock != 0)? TRUE : FALSE;
 490 }
 491
 492 /*
 493  *      Initialize a usimple_lock.
 494  *
 495  *      No change in preemption state.
 496  */
 497 void
 498 usimple_lock_init(
 499         usimple_lock_t  l,
 500         __unused unsigned short tag)
 501 {
 502         USLDBG(usld_lock_init(l, tag));
 503         hw_lock_init(&l->interlock);
 504 }
 505
/*
 * State recorded on a spinlock acquisition timeout:
 *   spinlock_owner_cpu - written by spinlock_timeout_NMI() with the cpu found
 *                        running the owner thread; stays ~0 if none is found.
 *   spinlock_timed_out - the lock that timed out, written by
 *                        usimple_lock_acquire_timeout_panic().
 */
volatile uint32_t spinlock_owner_cpu = ~0;
volatile usimple_lock_t spinlock_timed_out;
 508
 509 uint32_t
 510 spinlock_timeout_NMI(uintptr_t thread_addr)
 511 {
 512         uint32_t i;
 513
 514         for (i = 0; i < real_ncpus; i++) {
 515                 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
 516                         spinlock_owner_cpu = i;
 517                         if ((uint32_t) cpu_number() != i) {
 518                                 /* Cause NMI and panic on the owner's cpu */
 519                                 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
 520                         }
 521                         break;
 522                 }
 523         }
 524
 525         return spinlock_owner_cpu;
 526 }
 527
 528 __abortlike
 529 static void
 530 usimple_lock_acquire_timeout_panic(usimple_lock_t l)
 531 {
 532         uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
 533         uint32_t lock_cpu;
 534
 535         spinlock_timed_out = l; /* spinlock_timeout_NMI consumes this */
 536         lock_cpu = spinlock_timeout_NMI(lowner);
 537         panic("Spinlock acquisition timed out: lock=%p, "
 538             "lock owner thread=0x%lx, current_thread: %p, "
 539             "lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
 540             l, lowner, current_thread(), lock_cpu,
 541             (uintptr_t)l->interlock.lock_data, mach_absolute_time());
 542 }
 543
 544 /*
 545  *      Acquire a usimple_lock.
 546  *
 547  *      Returns with preemption disabled.  Note
 548  *      that the hw_lock routines are responsible for
 549  *      maintaining preemption state.
 550  */
 551 void
 552 (usimple_lock)(
 553         usimple_lock_t  l
 554         LCK_GRP_ARG(lck_grp_t *grp))
 555 {
 556         DECL_PC(pc);
 557
 558         OBTAIN_PC(pc);
 559         USLDBG(usld_lock_pre(l, pc));
 560
 561         while (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
 562                 if (!machine_timeout_suspended()) {
 563                         usimple_lock_acquire_timeout_panic(l);
 564                 }
 565                 enable_preemption();
 566         }
 567
 568 #if DEVELOPMENT || DEBUG
 569         pltrace(FALSE);
 570 #endif
 571
 572         USLDBG(usld_lock_post(l, pc));
 573 #if CONFIG_DTRACE
 574         LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
 575 #endif
 576 }
 577
 578 /*
 579  *      Acquire a usimple_lock_nopreempt
 580  *
 581  *      Called and returns with preemption disabled.  Note
 582  *      that the hw_lock routines are responsible for
 583  *      maintaining preemption state.
 584  */
 585 static void
 586 usimple_lock_nopreempt(
 587         usimple_lock_t  l,
 588         lck_grp_t *grp)
 589 {
 590         DECL_PC(pc);
 591
 592         OBTAIN_PC(pc);
 593         USLDBG(usld_lock_pre(l, pc));
 594
 595         while (__improbable(hw_lock_to_nopreempt(&l->interlock, LockTimeOutTSC, grp) == 0)) {
 596                 if (!machine_timeout_suspended()) {
 597                         usimple_lock_acquire_timeout_panic(l);
 598                 }
 599                 enable_preemption();
 600         }
 601
 602 #if DEVELOPMENT || DEBUG
 603         pltrace(FALSE);
 604 #endif
 605
 606         USLDBG(usld_lock_post(l, pc));
 607 #if CONFIG_DTRACE
 608         LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
 609 #endif
 610 }
 611
 612
 613 /*
 614  *      Release a usimple_lock.
 615  *
 616  *      Returns with preemption enabled.  Note
 617  *      that the hw_lock routines are responsible for
 618  *      maintaining preemption state.
 619  */
 620 void
 621 usimple_unlock(
 622         usimple_lock_t  l)
 623 {
 624         DECL_PC(pc);
 625
 626         OBTAIN_PC(pc);
 627         USLDBG(usld_unlock(l, pc));
 628 #if DEVELOPMENT || DEBUG
 629         pltrace(TRUE);
 630 #endif
 631         hw_lock_unlock(&l->interlock);
 632 }
 633
 634 /*
 635  *      Release a usimple_unlock_nopreempt.
 636  *
 637  *      Called and returns with preemption enabled.  Note
 638  *      that the hw_lock routines are responsible for
 639  *      maintaining preemption state.
 640  */
 641 void
 642 usimple_unlock_nopreempt(
 643         usimple_lock_t  l)
 644 {
 645         DECL_PC(pc);
 646
 647         OBTAIN_PC(pc);
 648         USLDBG(usld_unlock(l, pc));
 649 #if DEVELOPMENT || DEBUG
 650         pltrace(TRUE);
 651 #endif
 652         hw_lock_unlock_nopreempt(&l->interlock);
 653 }
 654
 655 /*
 656  *      Conditionally acquire a usimple_lock.
 657  *
 658  *      On success, returns with preemption disabled.
 659  *      On failure, returns with preemption in the same state
 660  *      as when first invoked.  Note that the hw_lock routines
 661  *      are responsible for maintaining preemption state.
 662  *
 663  *      XXX No stats are gathered on a miss; I preserved this
 664  *      behavior from the original assembly-language code, but
 665  *      doesn't it make sense to log misses?  XXX
 666  */
 667 unsigned int
 668 usimple_lock_try(
 669         usimple_lock_t  l,
 670         lck_grp_t *grp)
 671 {
 672         unsigned int    success;
 673         DECL_PC(pc);
 674
 675         OBTAIN_PC(pc);
 676         USLDBG(usld_lock_try_pre(l, pc));
 677         if ((success = hw_lock_try(&l->interlock, grp))) {
 678 #if DEVELOPMENT || DEBUG
 679                 pltrace(FALSE);
 680 #endif
 681                 USLDBG(usld_lock_try_post(l, pc));
 682         }
 683         return success;
 684 }
 685
 686 /*
 687  *      Conditionally acquire a usimple_lock.
 688  *
 689  *      Called and returns with preemption disabled.  Note
 690  *      that the hw_lock routines are responsible for
 691  *      maintaining preemption state.
 692  *
 693  *      XXX No stats are gathered on a miss; I preserved this
 694  *      behavior from the original assembly-language code, but
 695  *      doesn't it make sense to log misses?  XXX
 696  */
 697 static unsigned int
 698 usimple_lock_try_nopreempt(
 699         usimple_lock_t  l,
 700         lck_grp_t *grp)
 701 {
 702         unsigned int    success;
 703         DECL_PC(pc);
 704
 705         OBTAIN_PC(pc);
 706         USLDBG(usld_lock_try_pre(l, pc));
 707         if ((success = hw_lock_try_nopreempt(&l->interlock, grp))) {
 708 #if DEVELOPMENT || DEBUG
 709                 pltrace(FALSE);
 710 #endif
 711                 USLDBG(usld_lock_try_post(l, pc));
 712         }
 713         return success;
 714 }
 715
 716 /*
 717  * Acquire a usimple_lock while polling for pending cpu signals
 718  * and spinning on a lock.
 719  *
 720  */
 721 unsigned
 722 int
 723 (usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
 724     uint64_t deadline
 725     LCK_GRP_ARG(lck_grp_t *grp))
 726 {
 727         boolean_t istate = ml_get_interrupts_enabled();
 728
 729         if (deadline < mach_absolute_time()) {
 730                 return 0;
 731         }
 732
 733         while (!simple_lock_try(l, grp)) {
 734                 if (!istate) {
 735                         cpu_signal_handler(NULL);
 736                 }
 737
 738                 if (deadline < mach_absolute_time()) {
 739                         return 0;
 740                 }
 741
 742                 cpu_pause();
 743         }
 744
 745         return 1;
 746 }
 747
 748 void
 749 (usimple_lock_try_lock_loop)(usimple_lock_t l
 750     LCK_GRP_ARG(lck_grp_t *grp))
 751 {
 752         /* When the lock is not contended, grab the lock and go. */
 753         if (!simple_lock_try(l, grp)) {
 754                 usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
 755         }
 756 }
 757
 758 unsigned
 759 int
 760 (usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
 761     uint64_t duration
 762     LCK_GRP_ARG(lck_grp_t *grp))
 763 {
 764         uint64_t deadline;
 765         uint64_t base_at;
 766         uint64_t duration_at;
 767
 768         /* Fast track for uncontended locks */
 769         if (simple_lock_try(l, grp)) {
 770                 return 1;
 771         }
 772
 773         base_at = mach_absolute_time();
 774
 775         nanoseconds_to_absolutetime(duration, &duration_at);
 776         deadline = base_at + duration_at;
 777         if (deadline < base_at) {
 778                 /* deadline has overflowed, make it saturate */
 779                 deadline = ULLONG_MAX;
 780         }
 781
 782         return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
 783 }
 784
 785 #if     USLOCK_DEBUG
/*
 *      States of a usimple_lock.  The default when initializing
 *      a usimple_lock is setting it up for debug checking.
 *
 *      These are bit flags kept in l->debug.state.  USLOCK_CHECKING(l)
 *      is true only when the global uslock_check is set and the lock
 *      itself carries USLOCK_CHECKED.
 */
#define USLOCK_CHECKED          0x0001          /* lock is being checked */
#define USLOCK_TAKEN            0x0002          /* lock has been taken */
#define USLOCK_INIT             0xBAA0          /* lock has been initialized */
#define USLOCK_INITIALIZED      (USLOCK_INIT|USLOCK_CHECKED)
#define USLOCK_CHECKING(l)      (uslock_check &&                        \
                                 ((l)->debug.state & USLOCK_CHECKED))
 796
 797 /*
 798  *      Initialize the debugging information contained
 799  *      in a usimple_lock.
 800  */
 801 void
 802 usld_lock_init(
 803         usimple_lock_t  l,
 804         __unused unsigned short tag)
 805 {
 806         if (l == USIMPLE_LOCK_NULL) {
 807                 panic("lock initialization:  null lock pointer");
 808         }
 809         l->lock_type = USLOCK_TAG;
 810         l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
 811         l->debug.lock_cpu = l->debug.unlock_cpu = 0;
 812         l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
 813         l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
 814         l->debug.duration[0] = l->debug.duration[1] = 0;
 815         l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
 816         l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
 817         l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
 818 }
 819
 820
 821 /*
 822  *      These checks apply to all usimple_locks, not just
 823  *      those with USLOCK_CHECKED turned on.
 824  */
 825 int
 826 usld_lock_common_checks(
 827         usimple_lock_t  l,
 828         char            *caller)
 829 {
 830         if (l == USIMPLE_LOCK_NULL) {
 831                 panic("%s:  null lock pointer", caller);
 832         }
 833         if (l->lock_type != USLOCK_TAG) {
 834                 panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
 835         }
 836         if (!(l->debug.state & USLOCK_INIT)) {
 837                 panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
 838         }
 839         return USLOCK_CHECKING(l);
 840 }
 841
 842
 843 /*
 844  *      Debug checks on a usimple_lock just before attempting
 845  *      to acquire it.
 846  */
 847 /* ARGSUSED */
 848 void
 849 usld_lock_pre(
 850         usimple_lock_t  l,
 851         pc_t            pc)
 852 {
 853         char    caller[] = "usimple_lock";
 854
 855
 856         if (!usld_lock_common_checks(l, caller)) {
 857                 return;
 858         }
 859
 860 /*
 861  *      Note that we have a weird case where we are getting a lock when we are]
 862  *      in the process of putting the system to sleep. We are running with no
 863  *      current threads, therefore we can't tell if we are trying to retake a lock
 864  *      we have or someone on the other processor has it.  Therefore we just
 865  *      ignore this test if the locking thread is 0.
 866  */
 867
 868         if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
 869             l->debug.lock_thread == (void *) current_thread()) {
 870                 printf("%s:  lock %p already locked (at %p) by",
 871                     caller, l, l->debug.lock_pc);
 872                 printf(" current thread %p (new attempt at pc %p)\n",
 873                     l->debug.lock_thread, pc);
 874                 panic("%s", caller);
 875         }
 876         mp_disable_preemption();
 877         mp_enable_preemption();
 878 }
 879
 880
 881 /*
 882  *      Debug checks on a usimple_lock just after acquiring it.
 883  *
 884  *      Pre-emption has been disabled at this point,
 885  *      so we are safe in using cpu_number.
 886  */
 887 void
 888 usld_lock_post(
 889         usimple_lock_t  l,
 890         pc_t            pc)
 891 {
 892         unsigned int mycpu;
 893         char    caller[] = "successful usimple_lock";
 894
 895
 896         if (!usld_lock_common_checks(l, caller)) {
 897                 return;
 898         }
 899
 900         if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
 901                 panic("%s:  lock %p became uninitialized",
 902                     caller, l);
 903         }
 904         if ((l->debug.state & USLOCK_TAKEN)) {
 905                 panic("%s:  lock 0x%p became TAKEN by someone else",
 906                     caller, l);
 907         }
 908
 909         mycpu = (unsigned int)cpu_number();
 910         assert(mycpu <= UCHAR_MAX);
 911
 912         l->debug.lock_thread = (void *)current_thread();
 913         l->debug.state |= USLOCK_TAKEN;
 914         l->debug.lock_pc = pc;
 915         l->debug.lock_cpu = (unsigned char)mycpu;
 916 }
 917
 918
 919 /*
 920  *      Debug checks on a usimple_lock just before
 921  *      releasing it.  Note that the caller has not
 922  *      yet released the hardware lock.
 923  *
 924  *      Preemption is still disabled, so there's
 925  *      no problem using cpu_number.
 926  */
 927 void
 928 usld_unlock(
 929         usimple_lock_t  l,
 930         pc_t            pc)
 931 {
 932         unsigned int mycpu;
 933         char    caller[] = "usimple_unlock";
 934
 935
 936         if (!usld_lock_common_checks(l, caller)) {
 937                 return;
 938         }
 939
 940         mycpu = cpu_number();
 941         assert(mycpu <= UCHAR_MAX);
 942
 943         if (!(l->debug.state & USLOCK_TAKEN)) {
 944                 panic("%s:  lock 0x%p hasn't been taken",
 945                     caller, l);
 946         }
 947         if (l->debug.lock_thread != (void *) current_thread()) {
 948                 panic("%s:  unlocking lock 0x%p, owned by thread %p",
 949                     caller, l, l->debug.lock_thread);
 950         }
 951         if (l->debug.lock_cpu != mycpu) {
 952                 printf("%s:  unlocking lock 0x%p on cpu 0x%x",
 953                     caller, l, mycpu);
 954                 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
 955                 panic("%s", caller);
 956         }
 957
 958         l->debug.unlock_thread = l->debug.lock_thread;
 959         l->debug.lock_thread = INVALID_PC;
 960         l->debug.state &= ~USLOCK_TAKEN;
 961         l->debug.unlock_pc = pc;
 962         l->debug.unlock_cpu = (unsigned char)mycpu;
 963 }
 964
 965
 966 /*
 967  *      Debug checks on a usimple_lock just before
 968  *      attempting to acquire it.
 969  *
 970  *      Preemption isn't guaranteed to be disabled.
 971  */
 972 void
 973 usld_lock_try_pre(
 974         usimple_lock_t  l,
 975         __unused pc_t   pc)
 976 {
 977         char    caller[] = "usimple_lock_try";
 978
 979         if (!usld_lock_common_checks(l, caller)) {
 980                 return;
 981         }
 982 }
 983
 984
 985 /*
 986  *      Debug checks on a usimple_lock just after
 987  *      successfully attempting to acquire it.
 988  *
 989  *      Preemption has been disabled by the
 990  *      lock acquisition attempt, so it's safe
 991  *      to use cpu_number.
 992  */
 993 void
 994 usld_lock_try_post(
 995         usimple_lock_t  l,
 996         pc_t            pc)
 997 {
 998         unsigned int mycpu;
 999         char    caller[] = "successful usimple_lock_try";
1000
1001         if (!usld_lock_common_checks(l, caller)) {
1002                 return;
1003         }
1004
1005         if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
1006                 panic("%s:  lock 0x%p became uninitialized",
1007                     caller, l);
1008         }
1009         if ((l->debug.state & USLOCK_TAKEN)) {
1010                 panic("%s:  lock 0x%p became TAKEN by someone else",
1011                     caller, l);
1012         }
1013
1014         mycpu = cpu_number();
1015         assert(mycpu <= UCHAR_MAX);
1016
1017         l->debug.lock_thread = (void *) current_thread();
1018         l->debug.state |= USLOCK_TAKEN;
1019         l->debug.lock_pc = pc;
1020         l->debug.lock_cpu = (unsigned char)mycpu;
1021 }
1022 #endif  /* USLOCK_DEBUG */
1023
1024 /*
1025  *      Routine:        lck_rw_alloc_init
1026  */
1027 lck_rw_t *
1028 lck_rw_alloc_init(
1029         lck_grp_t       *grp,
1030         lck_attr_t      *attr)
1031 {
1032         lck_rw_t *lck;
1033
1034         lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO);
1035         lck_rw_init(lck, grp, attr);
1036         return lck;
1037 }
1038
/*
 *      Routine:        lck_rw_free
 *      Function:
 *              Destroy the lock with lck_rw_destroy() and then
 *              hand its storage back to ZV_LCK_RW.
 */
void
lck_rw_free(
        lck_rw_t        *lck,
        lck_grp_t       *grp)
{
        /* tear the lock down before its memory is released */
        lck_rw_destroy(lck, grp);
        zfree(ZV_LCK_RW, lck);
}
1050
1051 /*
1052  *      Routine:        lck_rw_init
1053  */
1054 void
1055 lck_rw_init(
1056         lck_rw_t        *lck,
1057         lck_grp_t       *grp,
1058         lck_attr_t      *attr)
1059 {
1060         lck_attr_t      *lck_attr = (attr != LCK_ATTR_NULL) ?
1061             attr : &LockDefaultLckAttr;
1062
1063         hw_lock_byte_init(&lck->lck_rw_interlock);
1064         lck->lck_rw_want_write = FALSE;
1065         lck->lck_rw_want_upgrade = FALSE;
1066         lck->lck_rw_shared_count = 0;
1067         lck->lck_rw_can_sleep = TRUE;
1068         lck->lck_r_waiting = lck->lck_w_waiting = 0;
1069         lck->lck_rw_tag = 0;
1070         lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
1071             LCK_ATTR_RW_SHARED_PRIORITY) == 0);
1072
1073         lck_grp_reference(grp);
1074         lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
1075 }
1076
1077 /*
1078  *      Routine:        lck_rw_destroy
1079  */
1080 void
1081 lck_rw_destroy(
1082         lck_rw_t        *lck,
1083         lck_grp_t       *grp)
1084 {
1085         if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
1086                 return;
1087         }
1088 #if MACH_LDEBUG
1089         lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
1090 #endif
1091         lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
1092         lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
1093         lck_grp_deallocate(grp);
1094         return;
1095 }
1096
1097 /*
1098  *      Sleep locks.  These use the same data structure and algorithm
1099  *      as the spin locks, but the process sleeps while it is waiting
1100  *      for the lock.  These work on uniprocessor systems.
1101  */
1102
/* NOTE(review): no use of this constant is visible in this part of the file and its units are not stated here -- confirm before relying on it. */
#define DECREMENTER_TIMEOUT 1000000
1104
1105 /*
1106  * We disable interrupts while holding the RW interlock to prevent an
1107  * interrupt from exacerbating hold time.
1108  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
1109  */
1110 static inline boolean_t
1111 lck_interlock_lock(lck_rw_t *lck)
1112 {
1113         boolean_t       istate;
1114
1115         istate = ml_set_interrupts_enabled(FALSE);
1116         hw_lock_byte_lock(&lck->lck_rw_interlock);
1117         return istate;
1118 }
1119
/*
 * Release the RW interlock byte and then set the interrupt-enable state
 * to istate (the value handed out by lck_interlock_lock()).
 */
static inline void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
        /* drop the interlock first; interrupts are only re-armed afterwards */
        hw_lock_byte_unlock(&lck->lck_rw_interlock);
        ml_set_interrupts_enabled(istate);
}
1126
1127 /*
1128  * This inline is used when busy-waiting for an rw lock.
1129  * If interrupts were disabled when the lock primitive was called,
1130  * we poll the IPI handler for pending tlb flushes.
1131  * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
1132  */
1133 static inline void
1134 lck_rw_lock_pause(boolean_t interrupts_enabled)
1135 {
1136         if (!interrupts_enabled) {
1137                 handle_pending_TLB_flushes();
1138         }
1139         cpu_pause();
1140 }
1141
1142 static inline boolean_t
1143 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
1144 {
1145         if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
1146                 return TRUE;
1147         }
1148         return FALSE;
1149 }
1150
1151 /*
1152  * compute the deadline to spin against when
1153  * waiting for a change of state on a lck_rw_t
1154  */
1155 static inline uint64_t
1156 lck_rw_deadline_for_spin(lck_rw_t *lck)
1157 {
1158         if (lck->lck_rw_can_sleep) {
1159                 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
1160                         /*
1161                          * there are already threads waiting on this lock... this
1162                          * implies that they have spun beyond their deadlines waiting for
1163                          * the desired state to show up so we will not bother spinning at this time...
1164                          *   or
1165                          * the current number of threads sharing this lock exceeds our capacity to run them
1166                          * concurrently and since all states we're going to spin for require the rw_shared_count
1167                          * to be at 0, we'll not bother spinning since the latency for this to happen is
1168                          * unpredictable...
1169                          */
1170                         return mach_absolute_time();
1171                 }
1172                 return mach_absolute_time() + MutexSpin;
1173         } else {
1174                 return mach_absolute_time() + (100000LL * 1000000000LL);
1175         }
1176 }
1177
1178
1179 /*
1180  * Spin while interlock is held.
1181  */
1182
1183 static inline void
1184 lck_rw_interlock_spin(lck_rw_t *lock)
1185 {
1186         while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1187                 cpu_pause();
1188         }
1189 }
1190
1191 static boolean_t
1192 lck_rw_grab_want(lck_rw_t *lock)
1193 {
1194         uint32_t        data, prev;
1195
1196         for (;;) {
1197                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
1198                 if ((data & LCK_RW_INTERLOCK) == 0) {
1199                         break;
1200                 }
1201                 atomic_exchange_abort();
1202                 lck_rw_interlock_spin(lock);
1203         }
1204         if (data & LCK_RW_WANT_WRITE) {
1205                 atomic_exchange_abort();
1206                 return FALSE;
1207         }
1208         data |= LCK_RW_WANT_WRITE;
1209         return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1210 }
1211
1212 static boolean_t
1213 lck_rw_grab_shared(lck_rw_t *lock)
1214 {
1215         uint32_t        data, prev;
1216
1217         for (;;) {
1218                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1219                 if ((data & LCK_RW_INTERLOCK) == 0) {
1220                         break;
1221                 }
1222                 atomic_exchange_abort();
1223                 lck_rw_interlock_spin(lock);
1224         }
1225         if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1226                 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1227                         atomic_exchange_abort();
1228                         return FALSE;
1229                 }
1230         }
1231         data += LCK_RW_SHARED_READER;
1232         return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1233 }
1234
/*
 *      Routine:        lck_rw_lock_exclusive_gen
 *      Function:
 *              Contended path for taking the lock exclusively.
 *              Works in two phases, each of which spins up to the
 *              deadline from lck_rw_deadline_for_spin() and, when the
 *              lock can sleep, then blocks on RW_LOCK_WRITER_EVENT:
 *                1. obtain the lck_rw_want_write bit;
 *                2. wait until lck_rw_held_read_or_upgrade() reports
 *                   that readers, upgrade and interlock are all clear.
 *              With CONFIG_DTRACE, the spin/block latency and the
 *              acquisition are reported through LOCKSTAT_RECORD.
 */
static void
lck_rw_lock_exclusive_gen(
        lck_rw_t        *lck)
{
        __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
        uint64_t        deadline = 0;
        int             slept = 0;      /* number of times we went through thread_block() */
        int             gotlock = 0;
        int             lockheld = 0;
        wait_result_t   res = 0;
        boolean_t       istate = -1;    /* -1 means the interrupt state has not been sampled yet */

#if     CONFIG_DTRACE
        boolean_t dtrace_ls_initialized = FALSE;
        boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
        uint64_t wait_interval = 0;
        int readers_at_sleep = 0;
#endif

        /*
         *      Try to acquire the lck_rw_want_write bit.
         */
        while (!lck_rw_grab_want(lck)) {
#if     CONFIG_DTRACE
                /* sample the lockstat probes once, the first time we have to wait */
                if (dtrace_ls_initialized == FALSE) {
                        dtrace_ls_initialized = TRUE;
                        dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
                        dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
                        dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
                        if (dtrace_ls_enabled) {
                                /*
                                 * Either sleeping or spinning is happening,
                                 *  start a timing of our delay interval now.
                                 */
                                readers_at_sleep = lck->lck_rw_shared_count;
                                wait_interval = mach_absolute_time();
                        }
                }
#endif
                if (istate == -1) {
                        istate = ml_get_interrupts_enabled();
                }

                deadline = lck_rw_deadline_for_spin(lck);

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

                /* spin for the want_write bit until we get it or the deadline passes */
                while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
                        lck_rw_lock_pause(istate);
                }

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);

                if (gotlock) {
                        break;
                }
                /*
                 * if we get here, the deadline has expired w/o us
                 * being able to grab the lock exclusively
                 * check to see if we're allowed to do a thread_block
                 */
                if (lck->lck_rw_can_sleep) {
                        /* istate now holds the interrupt state returned by lck_interlock_lock() */
                        istate = lck_interlock_lock(lck);

                        if (lck->lck_rw_want_write) {
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

                                lck->lck_w_waiting = TRUE;

                                /* the wait is asserted before the interlock is dropped */
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
                                    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);

                                if (res == THREAD_WAITING) {
                                        res = thread_block(THREAD_CONTINUE_NULL);
                                        slept++;
                                }
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
                        } else {
                                /* want_write was found clear under the interlock: claim it here */
                                lck->lck_rw_want_write = TRUE;
                                lck_interlock_unlock(lck, istate);
                                break;
                        }
                }
        }
        /*
         * Wait for readers (and upgrades) to finish...
         * the test for these conditions must be done simultaneously with
         * a check of the interlock not being held since
         * the rw_shared_count will drop to 0 first and then want_upgrade
         * will be set to 1 in the shared_to_exclusive scenario... those
         * adjustments are done behind the interlock and represent an
         * atomic change in state and must be considered as such
         * however, once we see the read count at 0, the want_upgrade not set
         * and the interlock not held, we are safe to proceed
         */
        while (lck_rw_held_read_or_upgrade(lck)) {
#if     CONFIG_DTRACE
                /*
                 * Either sleeping or spinning is happening, start
                 * a timing of our delay interval now, unless the first
                 * phase already did so.  If neither lockstat probe is
                 * enabled, dtrace_ls_enabled stays FALSE and no spin or
                 * sleep event is recorded later.
                 */
                if (dtrace_ls_initialized == FALSE) {
                        dtrace_ls_initialized = TRUE;
                        dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
                        dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
                        dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
                        if (dtrace_ls_enabled) {
                                /*
                                 * Either sleeping or spinning is happening,
                                 *  start a timing of our delay interval now.
                                 */
                                readers_at_sleep = lck->lck_rw_shared_count;
                                wait_interval = mach_absolute_time();
                        }
                }
#endif
                if (istate == -1) {
                        istate = ml_get_interrupts_enabled();
                }

                deadline = lck_rw_deadline_for_spin(lck);

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

                /* spin until readers/upgrade/interlock clear or the deadline passes */
                while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
                        lck_rw_lock_pause(istate);
                }

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);

                if (!lockheld) {
                        break;
                }
                /*
                 * if we get here, the deadline has expired w/o us
                 * being able to grab the lock exclusively
                 * check to see if we're allowed to do a thread_block
                 */
                if (lck->lck_rw_can_sleep) {
                        istate = lck_interlock_lock(lck);

                        if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

                                lck->lck_w_waiting = TRUE;

                                /* the wait is asserted before the interlock is dropped */
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
                                    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);

                                if (res == THREAD_WAITING) {
                                        res = thread_block(THREAD_CONTINUE_NULL);
                                        slept++;
                                }
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
                        } else {
                                lck_interlock_unlock(lck, istate);
                                /*
                                 * must own the lock now, since we checked for
                                 * readers or upgrade owner behind the interlock
                                 * no need for a call to 'lck_rw_held_read_or_upgrade'
                                 */
                                break;
                        }
                }
        }

#if     CONFIG_DTRACE
        /*
         * Decide what latencies we suffered that are Dtrace events.
         * If dtrace_ls_enabled is set, then we either spun or slept
         * and wait_interval holds the time at which that started.
         * At least we get out from under the interlock before we record
         * which is the best we can do here to minimize the impact
         * of the tracing.
         * If dtrace_ls_enabled is FALSE, neither probe was enabled when we
         * first had to wait (or we never waited), so no spin/block event
         * is recorded.
         */
        if (dtrace_ls_enabled == TRUE) {
                if (slept == 0) {
                        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
                            mach_absolute_time() - wait_interval, 1);
                } else {
                        /*
                         * For the blocking case, we also record if when we blocked
                         * it was held for read or write, and how many readers.
                         * readers_at_sleep was sampled when we first started
                         * waiting, at a point where the interlock was not held
                         * by us, so it is a best-effort snapshot.
                         */
                        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
                            mach_absolute_time() - wait_interval, 1,
                            (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
                }
        }
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
#endif
}
1439
1440 /*
1441  *      Routine:        lck_rw_done
1442  */
1443
1444 lck_rw_type_t
1445 lck_rw_done(lck_rw_t *lock)
1446 {
1447         uint32_t        data, prev;
1448
1449         for (;;) {
1450                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1451                 if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
1452                         atomic_exchange_abort();
1453                         lck_rw_interlock_spin(lock);
1454                         continue;
1455                 }
1456                 if (data & LCK_RW_SHARED_MASK) {
1457                         data -= LCK_RW_SHARED_READER;
1458                         if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1459                                 goto check_waiters;
1460                         }
1461                 } else {                                        /* if reader count == 0, must be exclusive lock */
1462                         if (data & LCK_RW_WANT_UPGRADE) {
1463                                 data &= ~(LCK_RW_WANT_UPGRADE);
1464                         } else {
1465                                 if (data & LCK_RW_WANT_WRITE) {
1466                                         data &= ~(LCK_RW_WANT_EXCL);
1467                                 } else {                                /* lock is not 'owned', panic */
1468                                         panic("Releasing non-exclusive RW lock without a reader refcount!");
1469                                 }
1470                         }
1471 check_waiters:
1472                         if (prev & LCK_RW_W_WAITING) {
1473                                 data &= ~(LCK_RW_W_WAITING);
1474                                 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1475                                         data &= ~(LCK_RW_R_WAITING);
1476                                 }
1477                         } else {
1478                                 data &= ~(LCK_RW_R_WAITING);
1479                         }
1480                 }
1481                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1482                         break;
1483                 }
1484                 cpu_pause();
1485         }
1486         return lck_rw_done_gen(lock, prev);
1487 }
1488
1489 /*
1490  *      Routine:        lck_rw_done_gen
1491  *
1492  *      called from lck_rw_done()
1493  *      prior_lock_state is the value in the 1st
1494  *      word of the lock at the time of a successful
1495  *      atomic compare and exchange with the new value...
1496  *      it represents the state of the lock before we
1497  *      decremented the rw_shared_count or cleared either
1498  *      rw_want_upgrade or rw_want_write and
1499  *      the lck_x_waiting bits...  since the wrapper
1500  *      routine has already changed the state atomically,
1501  *      we just need to decide if we should
1502  *      wake up anyone and what value to return... we do
1503  *      this by examining the state of the lock before
1504  *      we changed it
1505  */
1506 static lck_rw_type_t
1507 lck_rw_done_gen(
1508         lck_rw_t        *lck,
1509         uint32_t        prior_lock_state)
1510 {
1511         lck_rw_t        *fake_lck;
1512         lck_rw_type_t   lock_type;
1513         thread_t        thread;
1514         uint32_t        rwlock_count;
1515
1516         thread = current_thread();
1517         rwlock_count = thread->rwlock_count--;
1518         fake_lck = (lck_rw_t *)&prior_lock_state;
1519
1520         if (lck->lck_rw_can_sleep) {
1521                 /*
1522                  * prior_lock state is a snapshot of the 1st word of the
1523                  * lock in question... we'll fake up a pointer to it
1524                  * and carefully not access anything beyond whats defined
1525                  * in the first word of a lck_rw_t
1526                  */
1527
1528                 if (fake_lck->lck_rw_shared_count <= 1) {
1529                         if (fake_lck->lck_w_waiting) {
1530                                 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1531                         }
1532
1533                         if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1534                                 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1535                         }
1536                 }
1537 #if MACH_LDEBUG
1538                 if (rwlock_count == 0) {
1539                         panic("rw lock count underflow for thread %p", thread);
1540                 }
1541 #endif
1542                 /* Check if dropping the lock means that we need to unpromote */
1543
1544                 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1545                         /* sched_flags checked without lock, but will be rechecked while clearing */
1546                         lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1547                 }
1548         }
1549         if (fake_lck->lck_rw_shared_count) {
1550                 lock_type = LCK_RW_TYPE_SHARED;
1551         } else {
1552                 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1553         }
1554
1555 #if CONFIG_DTRACE
1556         LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1557 #endif
1558
1559         return lock_type;
1560 }
1561
1562
1563 /*
1564  *      Routine:        lck_rw_unlock
1565  */
1566 void
1567 lck_rw_unlock(
1568         lck_rw_t        *lck,
1569         lck_rw_type_t   lck_rw_type)
1570 {
1571         if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1572                 lck_rw_unlock_shared(lck);
1573         } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1574                 lck_rw_unlock_exclusive(lck);
1575         } else {
1576                 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1577         }
1578 }
1579
1580
1581 /*
1582  *      Routine:        lck_rw_unlock_shared
1583  */
1584 void
1585 lck_rw_unlock_shared(
1586         lck_rw_t        *lck)
1587 {
1588         lck_rw_type_t   ret;
1589
1590         assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1591         ret = lck_rw_done(lck);
1592
1593         if (ret != LCK_RW_TYPE_SHARED) {
1594                 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1595         }
1596 }
1597
1598
1599 /*
1600  *      Routine:        lck_rw_unlock_exclusive
1601  */
1602 void
1603 lck_rw_unlock_exclusive(
1604         lck_rw_t        *lck)
1605 {
1606         lck_rw_type_t   ret;
1607
1608         ret = lck_rw_done(lck);
1609
1610         if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1611                 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1612         }
1613 }
1614
1615
1616 /*
1617  *      Routine:        lck_rw_lock
1618  */
1619 void
1620 lck_rw_lock(
1621         lck_rw_t        *lck,
1622         lck_rw_type_t   lck_rw_type)
1623 {
1624         if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1625                 lck_rw_lock_shared(lck);
1626         } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1627                 lck_rw_lock_exclusive(lck);
1628         } else {
1629                 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1630         }
1631 }
1632
1633 /*
1634  *      Routine:        lck_rw_lock_shared
1635  */
1636 void
1637 lck_rw_lock_shared(lck_rw_t *lock)
1638 {
1639         uint32_t        data, prev;
1640
1641         current_thread()->rwlock_count++;
1642         for (;;) {
1643                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1644                 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1645                         atomic_exchange_abort();
1646                         if (lock->lck_rw_can_sleep) {
1647                                 lck_rw_lock_shared_gen(lock);
1648                         } else {
1649                                 cpu_pause();
1650                                 continue;
1651                         }
1652                         break;
1653                 }
1654                 data += LCK_RW_SHARED_READER;
1655                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1656                         break;
1657                 }
1658                 cpu_pause();
1659         }
1660 #if     CONFIG_DTRACE
1661         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1662 #endif  /* CONFIG_DTRACE */
1663         return;
1664 }
1665
1666 /*
1667  *      Routine:        lck_rw_lock_shared_gen
1668  *      Function:
1669  *              assembly fast path code has determined that this lock
1670  *              is held exclusively... this is where we spin/block
1671  *              until we can acquire the lock in the shared mode
1672  */
1673 static void
1674 lck_rw_lock_shared_gen(
1675         lck_rw_t        *lck)
1676 {
1677         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1678         uint64_t        deadline = 0;
1679         int             gotlock = 0;
1680         int             slept = 0;
1681         wait_result_t   res = 0;
1682         boolean_t       istate = -1;
1683
1684 #if     CONFIG_DTRACE
1685         uint64_t wait_interval = 0;
1686         int readers_at_sleep = 0;
1687         boolean_t dtrace_ls_initialized = FALSE;
1688         boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1689 #endif
1690
1691         while (!lck_rw_grab_shared(lck)) {
1692 #if     CONFIG_DTRACE
1693                 if (dtrace_ls_initialized == FALSE) {
1694                         dtrace_ls_initialized = TRUE;
1695                         dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1696                         dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1697                         dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1698                         if (dtrace_ls_enabled) {
1699                                 /*
1700                                  * Either sleeping or spinning is happening,
1701                                  *  start a timing of our delay interval now.
1702                                  */
1703                                 readers_at_sleep = lck->lck_rw_shared_count;
1704                                 wait_interval = mach_absolute_time();
1705                         }
1706                 }
1707 #endif
1708                 if (istate == -1) {
1709                         istate = ml_get_interrupts_enabled();
1710                 }
1711
1712                 deadline = lck_rw_deadline_for_spin(lck);
1713
1714                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1715                     trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1716
1717                 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
1718                         lck_rw_lock_pause(istate);
1719                 }
1720
1721                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1722                     trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1723
1724                 if (gotlock) {
1725                         break;
1726                 }
1727                 /*
1728                  * if we get here, the deadline has expired w/o us
1729                  * being able to grab the lock for read
1730                  * check to see if we're allowed to do a thread_block
1731                  */
1732                 if (lck->lck_rw_can_sleep) {
1733                         istate = lck_interlock_lock(lck);
1734
1735                         if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1736                             ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1737                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1738                                     trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1739
1740                                 lck->lck_r_waiting = TRUE;
1741
1742                                 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1743                                 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1744                                     THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1745                                 lck_interlock_unlock(lck, istate);
1746
1747                                 if (res == THREAD_WAITING) {
1748                                         res = thread_block(THREAD_CONTINUE_NULL);
1749                                         slept++;
1750                                 }
1751                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1752                                     trace_lck, res, slept, 0, 0);
1753                         } else {
1754                                 lck->lck_rw_shared_count++;
1755                                 lck_interlock_unlock(lck, istate);
1756                                 break;
1757                         }
1758                 }
1759         }
1760
1761 #if     CONFIG_DTRACE
1762         if (dtrace_ls_enabled == TRUE) {
1763                 if (slept == 0) {
1764                         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1765                 } else {
1766                         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1767                             mach_absolute_time() - wait_interval, 0,
1768                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1769                 }
1770         }
1771         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1772 #endif
1773 }
1774
/*
 * Fast-path attempt at an exclusive acquire.
 *
 * Hands atomic_test_and_set32() the lock word, a test mask covering the
 * reader count, WANT_EXCL, WANT_UPGRADE and the interlock bit, and
 * LCK_RW_WANT_EXCL as the bits to set.  The callers in this file treat a
 * non-zero result as "lock acquired" and fall back to
 * lck_rw_lock_exclusive_gen() otherwise.
 *
 * NOTE(review): presumably the set is only performed when none of the
 * masked bits are currently set -- confirm against atomic_test_and_set32().
 */
#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->data, \
            (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
            LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1778
1779 /*
1780  *      Routine:        lck_rw_lock_exclusive_check_contended
1781  */
1782
1783 bool
1784 lck_rw_lock_exclusive_check_contended(lck_rw_t *lock)
1785 {
1786         bool contended = false;
1787         current_thread()->rwlock_count++;
1788         if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1789 #if     CONFIG_DTRACE
1790                 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1791 #endif  /* CONFIG_DTRACE */
1792         } else {
1793                 contended = true;
1794                 lck_rw_lock_exclusive_gen(lock);
1795         }
1796         return contended;
1797 }
1798
1799 /*
1800  *      Routine:        lck_rw_lock_exclusive
1801  */
1802
1803 void
1804 lck_rw_lock_exclusive(lck_rw_t *lock)
1805 {
1806         current_thread()->rwlock_count++;
1807         if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1808 #if     CONFIG_DTRACE
1809                 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1810 #endif  /* CONFIG_DTRACE */
1811         } else {
1812                 lck_rw_lock_exclusive_gen(lock);
1813         }
1814 }
1815
1816
1817 /*
1818  *      Routine:        lck_rw_lock_shared_to_exclusive
1819  *
1820  *      False returned upon failure, in this case the shared lock is dropped.
1821  */
1822
1823 boolean_t
1824 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1825 {
1826         uint32_t        data, prev;
1827
1828         for (;;) {
1829                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1830                 if (data & LCK_RW_INTERLOCK) {
1831                         atomic_exchange_abort();
1832                         lck_rw_interlock_spin(lock);
1833                         continue;
1834                 }
1835                 if (data & LCK_RW_WANT_UPGRADE) {
1836                         data -= LCK_RW_SHARED_READER;
1837                         if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
1838                                 data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1839                         }
1840                         if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1841                                 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1842                         }
1843                 } else {
1844                         data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1845                         data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1846                         if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1847                                 break;
1848                         }
1849                 }
1850                 cpu_pause();
1851         }
1852         /* we now own the WANT_UPGRADE */
1853         if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
1854                 lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1855         }
1856 #if     CONFIG_DTRACE
1857         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1858 #endif
1859         return TRUE;
1860 }
1861
1862
1863 /*
1864  *      Routine:        lck_rw_lock_shared_to_exclusive_failure
1865  *      Function:
1866  *              assembly fast path code has already dropped our read
1867  *              count and determined that someone else owns 'lck_rw_want_upgrade'
1868  *              if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1869  *              all we need to do here is determine if a wakeup is needed
1870  */
1871 static boolean_t
1872 lck_rw_lock_shared_to_exclusive_failure(
1873         lck_rw_t        *lck,
1874         uint32_t        prior_lock_state)
1875 {
1876         lck_rw_t        *fake_lck;
1877         thread_t        thread = current_thread();
1878         uint32_t        rwlock_count;
1879
1880         /* Check if dropping the lock means that we need to unpromote */
1881         rwlock_count = thread->rwlock_count--;
1882 #if MACH_LDEBUG
1883         if (rwlock_count == 0) {
1884                 panic("rw lock count underflow for thread %p", thread);
1885         }
1886 #endif
1887         fake_lck = (lck_rw_t *)&prior_lock_state;
1888
1889         if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1890                 /*
1891                  *      Someone else has requested upgrade.
1892                  *      Since we've released the read lock, wake
1893                  *      him up if he's blocked waiting
1894                  */
1895                 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1896         }
1897
1898         if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1899                 /* sched_flags checked without lock, but will be rechecked while clearing */
1900                 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1901         }
1902
1903         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1904             VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1905
1906         return FALSE;
1907 }
1908
1909
/*
 *      Routine:        lck_rw_lock_shared_to_exclusive_success
 *      Function:
 *              fast path code has already dropped our read
 *              count and successfully acquired 'lck_rw_want_upgrade'
 *              we just need to wait for the rest of the readers to drain
 *              and then we can return as the exclusive holder of this lock
 *
 *              Each pass spins until the deadline computed by
 *              lck_rw_deadline_for_spin(); if readers are still present
 *              and the lock allows sleeping, the thread blocks on the
 *              writer event and re-evaluates when it resumes.
 *
 *              Always returns TRUE.
 */
static boolean_t
lck_rw_lock_shared_to_exclusive_success(
        lck_rw_t        *lck)
{
        __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
        uint64_t        deadline = 0;
        int             slept = 0;              /* number of times we actually blocked */
        int             still_shared = 0;       /* reader count last seen by the spin loop */
        wait_result_t   res;
        boolean_t       istate = -1;            /* -1: interrupt state not sampled yet */

#if     CONFIG_DTRACE
        uint64_t wait_interval = 0;
        int readers_at_sleep = 0;
        boolean_t dtrace_ls_initialized = FALSE;
        boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

        while (lck->lck_rw_shared_count != 0) {
#if     CONFIG_DTRACE
                /* sample the lockstat probes once, on the first pass only */
                if (dtrace_ls_initialized == FALSE) {
                        dtrace_ls_initialized = TRUE;
                        dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
                        dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
                        dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
                        if (dtrace_ls_enabled) {
                                /*
                                 * Either sleeping or spinning is happening,
                                 *  start a timing of our delay interval now.
                                 */
                                readers_at_sleep = lck->lck_rw_shared_count;
                                wait_interval = mach_absolute_time();
                        }
                }
#endif
                if (istate == -1) {
                        istate = ml_get_interrupts_enabled();
                }

                deadline = lck_rw_deadline_for_spin(lck);

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
                    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

                /* spin phase: poll the reader count until it hits 0 or the deadline passes */
                while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
                        lck_rw_lock_pause(istate);
                }

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
                    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

                if (!still_shared) {
                        break;
                }
                /*
                 * if we get here, the deadline has expired w/o
                 * the rw_shared_count having drained to 0
                 * check to see if we're allowed to do a thread_block
                 * (if not, the outer loop simply starts another spin pass)
                 */
                if (lck->lck_rw_can_sleep) {
                        istate = lck_interlock_lock(lck);

                        /* re-check the reader count now that the interlock is held */
                        if (lck->lck_rw_shared_count != 0) {
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
                                    trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

                                lck->lck_w_waiting = TRUE;

                                /* queue the wait before dropping the interlock, block only afterwards */
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
                                res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
                                    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);

                                if (res == THREAD_WAITING) {
                                        res = thread_block(THREAD_CONTINUE_NULL);
                                        slept++;
                                }
                                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
                                    trace_lck, res, slept, 0, 0);
                        } else {
                                /* readers drained while we were taking the interlock */
                                lck_interlock_unlock(lck, istate);
                                break;
                        }
                }
        }
#if     CONFIG_DTRACE
        /*
         * If a spin/block probe was enabled when we started waiting, report
         * the delay: as a spin if we never blocked (slept == 0), otherwise
         * as a block.
         */
        if (dtrace_ls_enabled == TRUE) {
                if (slept == 0) {
                        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
                } else {
                        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
                            mach_absolute_time() - wait_interval, 1,
                            (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
                }
        }
        LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
#endif
        return TRUE;
}
2020
2021 /*
2022  *      Routine:        lck_rw_lock_exclusive_to_shared
2023  */
2024
2025 void
2026 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
2027 {
2028         uint32_t        data, prev;
2029
2030         for (;;) {
2031                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
2032                 if (data & LCK_RW_INTERLOCK) {
2033                         atomic_exchange_abort();
2034                         lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
2035                         continue;
2036                 }
2037                 data += LCK_RW_SHARED_READER;
2038                 if (data & LCK_RW_WANT_UPGRADE) {
2039                         data &= ~(LCK_RW_WANT_UPGRADE);
2040                 } else {
2041                         data &= ~(LCK_RW_WANT_EXCL);
2042                 }
2043                 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
2044                         data &= ~(LCK_RW_W_WAITING);
2045                 }
2046                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
2047                         break;
2048                 }
2049                 cpu_pause();
2050         }
2051         return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
2052 }
2053
2054
2055 /*
2056  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
2057  *      Function:
2058  *              assembly fast path has already dropped
2059  *              our exclusive state and bumped lck_rw_shared_count
2060  *              all we need to do here is determine if anyone
2061  *              needs to be awakened.
2062  */
2063 static void
2064 lck_rw_lock_exclusive_to_shared_gen(
2065         lck_rw_t        *lck,
2066         uint32_t        prior_lock_state)
2067 {
2068         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
2069         lck_rw_t                *fake_lck;
2070
2071         fake_lck = (lck_rw_t *)&prior_lock_state;
2072
2073         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
2074             trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
2075
2076         /*
2077          * don't wake up anyone waiting to take the lock exclusively
2078          * since we hold a read count... when the read count drops to 0,
2079          * the writers will be woken.
2080          *
2081          * wake up any waiting readers if we don't have any writers waiting,
2082          * or the lock is NOT marked as rw_priv_excl (writers have privilege)
2083          */
2084         if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
2085                 thread_wakeup(RW_LOCK_READER_EVENT(lck));
2086         }
2087
2088         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
2089             trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
2090
2091 #if CONFIG_DTRACE
2092         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
2093 #endif
2094 }
2095
2096
2097 /*
2098  *      Routine:        lck_rw_try_lock
2099  */
2100 boolean_t
2101 lck_rw_try_lock(
2102         lck_rw_t        *lck,
2103         lck_rw_type_t   lck_rw_type)
2104 {
2105         if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2106                 return lck_rw_try_lock_shared(lck);
2107         } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2108                 return lck_rw_try_lock_exclusive(lck);
2109         } else {
2110                 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
2111         }
2112         return FALSE;
2113 }
2114
2115 /*
2116  *      Routine:        lck_rw_try_lock_shared
2117  */
2118
2119 boolean_t
2120 lck_rw_try_lock_shared(lck_rw_t *lock)
2121 {
2122         uint32_t        data, prev;
2123
2124         for (;;) {
2125                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
2126                 if (data & LCK_RW_INTERLOCK) {
2127                         atomic_exchange_abort();
2128                         lck_rw_interlock_spin(lock);
2129                         continue;
2130                 }
2131                 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2132                         atomic_exchange_abort();
2133                         return FALSE;                   /* lock is busy */
2134                 }
2135                 data += LCK_RW_SHARED_READER;           /* Increment reader refcount */
2136                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
2137                         break;
2138                 }
2139                 cpu_pause();
2140         }
2141         current_thread()->rwlock_count++;
2142         /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
2143 #if     CONFIG_DTRACE
2144         LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
2145 #endif  /* CONFIG_DTRACE */
2146         return TRUE;
2147 }
2148
2149
2150 /*
2151  *      Routine:        lck_rw_try_lock_exclusive
2152  */
2153
2154 boolean_t
2155 lck_rw_try_lock_exclusive(lck_rw_t *lock)
2156 {
2157         uint32_t        data, prev;
2158
2159         for (;;) {
2160                 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
2161                 if (data & LCK_RW_INTERLOCK) {
2162                         atomic_exchange_abort();
2163                         lck_rw_interlock_spin(lock);
2164                         continue;
2165                 }
2166                 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2167                         atomic_exchange_abort();
2168                         return FALSE;                           /* can't get it */
2169                 }
2170                 data |= LCK_RW_WANT_EXCL;
2171                 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
2172                         break;
2173                 }
2174                 cpu_pause();
2175         }
2176
2177         current_thread()->rwlock_count++;
2178 #if     CONFIG_DTRACE
2179         LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2180 #endif  /* CONFIG_DTRACE */
2181         return TRUE;
2182 }
2183
2184
2185 void
2186 lck_rw_assert(
2187         lck_rw_t        *lck,
2188         unsigned int    type)
2189 {
2190         switch (type) {
2191         case LCK_RW_ASSERT_SHARED:
2192                 if (lck->lck_rw_shared_count != 0) {
2193                         return;
2194                 }
2195                 break;
2196         case LCK_RW_ASSERT_EXCLUSIVE:
2197                 if ((lck->lck_rw_want_write ||
2198                     lck->lck_rw_want_upgrade) &&
2199                     lck->lck_rw_shared_count == 0) {
2200                         return;
2201                 }
2202                 break;
2203         case LCK_RW_ASSERT_HELD:
2204                 if (lck->lck_rw_want_write ||
2205                     lck->lck_rw_want_upgrade ||
2206                     lck->lck_rw_shared_count != 0) {
2207                         return;
2208                 }
2209                 break;
2210         case LCK_RW_ASSERT_NOTHELD:
2211                 if (!(lck->lck_rw_want_write ||
2212                     lck->lck_rw_want_upgrade ||
2213                     lck->lck_rw_shared_count != 0)) {
2214                         return;
2215                 }
2216                 break;
2217         default:
2218                 break;
2219         }
2220
2221         panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
2222 }
2223
2224 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
2225 #if MACH_LDEBUG
2226 __dead2
2227 #endif
2228 void
2229 lck_rw_clear_promotions_x86(thread_t thread)
2230 {
2231 #if MACH_LDEBUG
2232         /* It's fatal to leave a RW lock locked and return to userspace */
2233         panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2234 #else
2235         /* Paper over the issue */
2236         thread->rwlock_count = 0;
2237         lck_rw_clear_promotion(thread, 0);
2238 #endif
2239 }
2240
2241 boolean_t
2242 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2243 {
2244         lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2245
2246         if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2247                 lck_rw_unlock_shared(lck);
2248                 mutex_pause(2);
2249                 lck_rw_lock_shared(lck);
2250                 return TRUE;
2251         }
2252
2253         return FALSE;
2254 }
2255
2256 /*
2257  * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2258  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2259  */
2260 boolean_t
2261 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2262 {
2263         if (not_in_kdp) {
2264                 panic("panic: rw lock exclusive check done outside of kernel debugger");
2265         }
2266         return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2267 }
2268
2269 /*
2270  * Slow path routines for lck_mtx locking and unlocking functions.
2271  *
 * These functions were previously implemented in x86 assembly,
 * and some optimizations are in place in this C code to obtain compiled code
 * as performant and compact as the assembly version.
2275  *
 * To avoid inlining these functions into the fast path, all functions directly called by
 * the fast paths are marked __attribute__((noinline)). They are also all implemented
 * in such a way that the fast path can tail call into them. This way the return address
 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2280  *
 * Slow path code is structured in such a way that there are no calls to functions that will return
 * in the context of the caller function, i.e. all functions called are either tail call functions
 * or inline functions. The tail call functions take fewer than six arguments,
 * so that the arguments can be passed in registers and do not need to be pushed on the stack.
 * This allows the compiler to not create a stack frame for the functions.
2286  *
 * __improbable and __probable are used to compile the slow path code in such a way that
 * the fast path case will be a sequence of instructions with as few jumps as possible,
 * to make this case the most optimized even if falling through the slow path.
2290  */
2291
2292 /*
2293  * Intel lock invariants:
2294  *
2295  * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2296  *
2297  * The lock owner is promoted to the max priority of all its waiters only if it
2298  * was a lower priority when it acquired or was an owner when a waiter waited.
2299  * Max priority is capped at MAXPRI_PROMOTE.
2300  *
2301  * The last waiter will not be promoted as it is woken up, but the last
2302  * lock owner may not have been the last thread to have been woken up depending on the
2303  * luck of the draw.  Therefore a last-owner may still have the promoted-on-wakeup
2304  * flag set.
2305  *
2306  * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2307  *       priority from dropping priority in the future without having to take thread lock
2308  *       on acquire.
2309  */
2310
2311 /*
2312  *      Routine:        lck_mtx_alloc_init
2313  */
2314 lck_mtx_t *
2315 lck_mtx_alloc_init(
2316         lck_grp_t       *grp,
2317         lck_attr_t      *attr)
2318 {
2319         lck_mtx_t *lck;
2320
2321         lck = zalloc(ZV_LCK_MTX);
2322         lck_mtx_init(lck, grp, attr);
2323         return lck;
2324 }
2325
2326 /*
2327  *      Routine:        lck_mtx_free
2328  */
2329 void
2330 lck_mtx_free(
2331         lck_mtx_t       *lck,
2332         lck_grp_t       *grp)
2333 {
2334         lck_mtx_destroy(lck, grp);
2335         zfree(ZV_LCK_MTX, lck);
2336 }
2337
2338 /*
2339  *      Routine:        lck_mtx_ext_init
2340  */
2341 static void
2342 lck_mtx_ext_init(
2343         lck_mtx_ext_t   *lck,
2344         lck_grp_t       *grp,
2345         lck_attr_t      *attr)
2346 {
2347         bzero((void *)lck, sizeof(lck_mtx_ext_t));
2348
2349         if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2350                 lck->lck_mtx_deb.type = MUTEX_TAG;
2351                 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2352         }
2353
2354         lck->lck_mtx_grp = grp;
2355
2356         if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2357                 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2358         }
2359
2360         lck->lck_mtx.lck_mtx_is_ext = 1;
2361         lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2362 }
2363
2364 /*
2365  *      Routine:        lck_mtx_init
2366  */
2367 void
2368 lck_mtx_init(
2369         lck_mtx_t       *lck,
2370         lck_grp_t       *grp,
2371         lck_attr_t      *attr)
2372 {
2373         lck_mtx_ext_t   *lck_ext;
2374         lck_attr_t      *lck_attr;
2375
2376         if (attr != LCK_ATTR_NULL) {
2377                 lck_attr = attr;
2378         } else {
2379                 lck_attr = &LockDefaultLckAttr;
2380         }
2381
2382         if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2383                 lck_ext = zalloc(ZV_LCK_MTX_EXT);
2384                 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2385                 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2386                 lck->lck_mtx_ptr = lck_ext;
2387         } else {
2388                 lck->lck_mtx_owner = 0;
2389                 lck->lck_mtx_state = 0;
2390         }
2391         lck->lck_mtx_pad32 = 0xFFFFFFFF;
2392         lck_grp_reference(grp);
2393         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2394 }
2395
2396 /*
2397  *      Routine:        lck_mtx_init_ext
2398  */
2399 void
2400 lck_mtx_init_ext(
2401         lck_mtx_t       *lck,
2402         lck_mtx_ext_t   *lck_ext,
2403         lck_grp_t       *grp,
2404         lck_attr_t      *attr)
2405 {
2406         lck_attr_t      *lck_attr;
2407
2408         if (attr != LCK_ATTR_NULL) {
2409                 lck_attr = attr;
2410         } else {
2411                 lck_attr = &LockDefaultLckAttr;
2412         }
2413
2414         if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2415                 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2416                 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2417                 lck->lck_mtx_ptr = lck_ext;
2418         } else {
2419                 lck->lck_mtx_owner = 0;
2420                 lck->lck_mtx_state = 0;
2421         }
2422         lck->lck_mtx_pad32 = 0xFFFFFFFF;
2423
2424         lck_grp_reference(grp);
2425         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2426 }
2427
2428 static void
2429 lck_mtx_lock_mark_destroyed(
2430         lck_mtx_t *mutex,
2431         boolean_t indirect)
2432 {
2433         uint32_t state;
2434
2435         if (indirect) {
2436                 /* convert to destroyed state */
2437                 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2438                 return;
2439         }
2440
2441         state = ordered_load_mtx_state(mutex);
2442         lck_mtx_interlock_lock(mutex, &state);
2443
2444         ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2445
2446         enable_preemption();
2447 }
2448
2449 /*
2450  *      Routine:        lck_mtx_destroy
2451  */
2452 void
2453 lck_mtx_destroy(
2454         lck_mtx_t       *lck,
2455         lck_grp_t       *grp)
2456 {
2457         boolean_t indirect;
2458
2459         if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2460                 return;
2461         }
2462 #if MACH_LDEBUG
2463         lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2464 #endif
2465         indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2466
2467         lck_mtx_lock_mark_destroyed(lck, indirect);
2468
2469         if (indirect) {
2470                 zfree(ZV_LCK_MTX_EXT, lck->lck_mtx_ptr);
2471         }
2472         lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2473         lck_grp_deallocate(grp);
2474         return;
2475 }
2476
2477
#if DEVELOPMENT || DEBUG
/*
 * Panics, reporting the owner recorded in the mutex, when an unlock is
 * attempted by a thread that is not that owner.  Kept out of line so the
 * callers stay compact.
 */
__attribute__((noinline))
void
lck_mtx_owner_check_panic(lck_mtx_t *lock)
{
        panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p",
            (thread_t)lock->lck_mtx_owner, lock);
}
#endif
2488
2489 __attribute__((always_inline))
2490 static boolean_t
2491 get_indirect_mutex(
2492         lck_mtx_t       **lock,
2493         uint32_t        *state)
2494 {
2495         *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2496         *state = ordered_load_mtx_state(*lock);
2497         return TRUE;
2498 }
2499
2500 /*
2501  * Routine:     lck_mtx_unlock_slow
2502  *
2503  * Unlocks a mutex held by current thread.
2504  *
2505  * It will wake up waiters if necessary.
2506  *
2507  * Interlock can be held.
2508  */
2509 __attribute__((noinline))
2510 void
2511 lck_mtx_unlock_slow(
2512         lck_mtx_t       *lock)
2513 {
2514         thread_t        thread;
2515         uint32_t        state, prev;
2516         boolean_t       indirect = FALSE;
2517
2518         state = ordered_load_mtx_state(lock);
2519
2520         /* Is this an indirect mutex? */
2521         if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2522                 indirect = get_indirect_mutex(&lock, &state);
2523         }
2524
2525         thread = current_thread();
2526
2527 #if DEVELOPMENT | DEBUG
2528         thread_t owner = (thread_t)lock->lck_mtx_owner;
2529         if (__improbable(owner != thread)) {
2530                 lck_mtx_owner_check_panic(lock);
2531         }
2532 #endif
2533
2534         /* check if it is held as a spinlock */
2535         if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
2536                 goto unlock;
2537         }
2538
2539         lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2540
2541 unlock:
2542         /* preemption disabled, interlock held and mutex not held */
2543
2544         /* clear owner */
2545         ordered_store_mtx_owner(lock, 0);
2546         /* keep original state in prev for later evaluation */
2547         prev = state;
2548
2549         if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2550 #if     MACH_LDEBUG
2551                 if (thread) {
2552                         thread->mutex_count--;
2553                 }
2554 #endif
2555                 return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
2556         }
2557
2558         /* release interlock, promotion and clear spin flag */
2559         state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
2560         ordered_store_mtx_state_release(lock, state);           /* since I own the interlock, I don't need an atomic update */
2561
2562 #if     MACH_LDEBUG
2563         /* perform lock statistics after drop to prevent delay */
2564         if (thread) {
2565                 thread->mutex_count--;          /* lock statistic */
2566         }
2567 #endif  /* MACH_LDEBUG */
2568
2569         /* re-enable preemption */
2570         lck_mtx_unlock_finish_inline(lock, FALSE);
2571
2572         return;
2573 }
2574
/*
 * kdebug codes for mutex events.  The tracepoints in this file combine
 * them with DBG_MACH_LOCKS through MACHDBG_CODE(), e.g. the wakeup,
 * acquire and spin codes bracket their routines with
 * DBG_FUNC_START / DBG_FUNC_END.
 */
#define LCK_MTX_LCK_WAIT_CODE           0x20
#define LCK_MTX_LCK_WAKEUP_CODE         0x21
#define LCK_MTX_LCK_SPIN_CODE           0x22
#define LCK_MTX_LCK_ACQUIRE_CODE        0x23
#define LCK_MTX_LCK_DEMOTE_CODE         0x24
2580
2581 /*
2582  * Routine:    lck_mtx_unlock_wakeup_tail
2583  *
2584  * Invoked on unlock when there is
2585  * contention, i.e. the assembly routine sees
2586  * that mutex->lck_mtx_waiters != 0
2587  *
2588  * neither the mutex or interlock is held
2589  *
2590  * Note that this routine might not be called if there are pending
2591  * waiters which have previously been woken up, and they didn't
2592  * end up boosting the old owner.
2593  *
2594  * assembly routine previously did the following to mutex:
2595  * (after saving the state in prior_lock_state)
2596  *      decremented lck_mtx_waiters if nonzero
2597  *
2598  * This function needs to be called as a tail call
2599  * to optimize the compiled code.
2600  */
2601 __attribute__((noinline))
2602 static void
2603 lck_mtx_unlock_wakeup_tail(
2604         lck_mtx_t       *mutex,
2605         uint32_t        state,
2606         boolean_t       indirect)
2607 {
2608         struct turnstile *ts;
2609
2610         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2611         kern_return_t did_wake;
2612
2613         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2614             trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2615
2616         ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2617
2618         if (mutex->lck_mtx_waiters > 1) {
2619                 /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the wokenup thread */
2620                 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
2621         } else {
2622                 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2623                 turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
2624         }
2625         assert(did_wake == KERN_SUCCESS);
2626
2627         turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2628         turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2629
2630         state -= LCK_MTX_WAITER;
2631         state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
2632         ordered_store_mtx_state_release(mutex, state);
2633
2634         assert(current_thread()->turnstile != NULL);
2635
2636         turnstile_cleanup();
2637
2638         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2639             trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2640
2641         lck_mtx_unlock_finish_inline(mutex, indirect);
2642 }
2643
2644 /*
2645  * Routine:     lck_mtx_lock_acquire_x86
2646  *
2647  * Invoked on acquiring the mutex when there is
2648  * contention (i.e. the assembly routine sees that
2649  * that mutex->lck_mtx_waiters != 0
2650  *
2651  * mutex is owned...  interlock is held... preemption is disabled
2652  */
2653 __attribute__((always_inline))
2654 static void
2655 lck_mtx_lock_acquire_inline(
2656         lck_mtx_t       *mutex,
2657         struct turnstile *ts)
2658 {
2659         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2660
2661         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2662             trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2663
2664         thread_t thread = (thread_t)mutex->lck_mtx_owner;       /* faster than current_thread() */
2665         assert(thread->waiting_for_mutex == NULL);
2666
2667         if (mutex->lck_mtx_waiters > 0) {
2668                 if (ts == NULL) {
2669                         ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
2670                 }
2671
2672                 turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
2673                 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2674         }
2675
2676         if (ts != NULL) {
2677                 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2678         }
2679
2680         assert(current_thread()->turnstile != NULL);
2681
2682         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2683             trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2684 }
2685
2686 void
2687 lck_mtx_lock_acquire_x86(
2688         lck_mtx_t       *mutex)
2689 {
2690         return lck_mtx_lock_acquire_inline(mutex, NULL);
2691 }
2692
2693 /*
2694  * Tail call helpers for lock functions that perform
2695  * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2696  * the caller's compiled code.
2697  */
2698
2699 __attribute__((noinline))
2700 static void
2701 lck_mtx_lock_acquire_tail(
2702         lck_mtx_t       *mutex,
2703         boolean_t       indirect,
2704         struct turnstile *ts)
2705 {
2706         lck_mtx_lock_acquire_inline(mutex, ts);
2707         lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
2708 }
2709
2710 __attribute__((noinline))
2711 static boolean_t
2712 lck_mtx_try_lock_acquire_tail(
2713         lck_mtx_t       *mutex)
2714 {
2715         lck_mtx_lock_acquire_inline(mutex, NULL);
2716         lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2717
2718         return TRUE;
2719 }
2720
2721 __attribute__((noinline))
2722 static void
2723 lck_mtx_convert_spin_acquire_tail(
2724         lck_mtx_t       *mutex)
2725 {
2726         lck_mtx_lock_acquire_inline(mutex, NULL);
2727         lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2728 }
2729
2730 boolean_t
2731 lck_mtx_ilk_unlock(
2732         lck_mtx_t       *mutex)
2733 {
2734         lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2735         return TRUE;
2736 }
2737
2738 static inline void
2739 lck_mtx_interlock_lock_set_and_clear_flags(
2740         lck_mtx_t *mutex,
2741         uint32_t xor_flags,
2742         uint32_t and_flags,
2743         uint32_t *new_state)
2744 {
2745         uint32_t state, prev;
2746         state = *new_state;
2747
2748         for (;;) {
2749                 /* have to wait for interlock to clear */
2750                 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2751                         cpu_pause();
2752                         state = ordered_load_mtx_state(mutex);
2753                 }
2754                 prev = state;                                   /* prev contains snapshot for exchange */
2755                 state |= LCK_MTX_ILOCKED_MSK | xor_flags;       /* pick up interlock */
2756                 state &= ~and_flags;                            /* clear flags */
2757
2758                 disable_preemption();
2759                 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
2760                         break;
2761                 }
2762                 enable_preemption();
2763                 cpu_pause();
2764                 state = ordered_load_mtx_state(mutex);
2765         }
2766         *new_state = state;
2767         return;
2768 }
2769
2770 static inline void
2771 lck_mtx_interlock_lock_clear_flags(
2772         lck_mtx_t *mutex,
2773         uint32_t and_flags,
2774         uint32_t *new_state)
2775 {
2776         return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2777 }
2778
2779 static inline void
2780 lck_mtx_interlock_lock(
2781         lck_mtx_t *mutex,
2782         uint32_t *new_state)
2783 {
2784         return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2785 }
2786
2787 static inline int
2788 lck_mtx_interlock_try_lock_set_flags(
2789         lck_mtx_t *mutex,
2790         uint32_t or_flags,
2791         uint32_t *new_state)
2792 {
2793         uint32_t state, prev;
2794         state = *new_state;
2795
2796         /* have to wait for interlock to clear */
2797         if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2798                 return 0;
2799         }
2800         prev = state;                                   /* prev contains snapshot for exchange */
2801         state |= LCK_MTX_ILOCKED_MSK | or_flags;        /* pick up interlock */
2802         disable_preemption();
2803         if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
2804                 *new_state = state;
2805                 return 1;
2806         }
2807
2808         enable_preemption();
2809         return 0;
2810 }
2811
2812 __attribute__((noinline))
2813 static void
2814 lck_mtx_lock_contended(
2815         lck_mtx_t       *lock,
2816         boolean_t indirect,
2817         boolean_t *first_miss)
2818 {
2819         lck_mtx_spinwait_ret_type_t ret;
2820         uint32_t state;
2821         thread_t thread;
2822         struct turnstile *ts = NULL;
2823
2824 try_again:
2825
2826         if (indirect) {
2827                 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2828         }
2829
2830         ret = lck_mtx_lock_spinwait_x86(lock);
2831         state = ordered_load_mtx_state(lock);
2832         switch (ret) {
2833         case LCK_MTX_SPINWAIT_NO_SPIN:
2834                 /*
2835                  * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2836                  * try to spin.
2837                  */
2838                 if (indirect) {
2839                         lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2840                 }
2841
2842                 /* just fall through case LCK_MTX_SPINWAIT_SPUN */
2843                 OS_FALLTHROUGH;
2844         case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
2845         case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
2846         case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
2847         case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
2848                 /*
2849                  * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2850                  * interlock not held
2851                  */
2852                 lck_mtx_interlock_lock(lock, &state);
2853                 assert(state & LCK_MTX_ILOCKED_MSK);
2854
2855                 if (state & LCK_MTX_MLOCKED_MSK) {
2856                         if (indirect) {
2857                                 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2858                         }
2859                         lck_mtx_lock_wait_x86(lock, &ts);
2860                         /*
2861                          * interlock is not held here.
2862                          */
2863                         goto try_again;
2864                 } else {
2865                         /* grab the mutex */
2866                         state |= LCK_MTX_MLOCKED_MSK;
2867                         ordered_store_mtx_state_release(lock, state);
2868                         thread = current_thread();
2869                         ordered_store_mtx_owner(lock, (uintptr_t)thread);
2870 #if     MACH_LDEBUG
2871                         if (thread) {
2872                                 thread->mutex_count++;
2873                         }
2874 #endif  /* MACH_LDEBUG */
2875                 }
2876
2877                 break;
2878         case LCK_MTX_SPINWAIT_ACQUIRED:
2879                 /*
2880                  * mutex has been acquired by lck_mtx_lock_spinwait_x86
2881                  * interlock is held and preemption disabled
2882                  * owner is set and mutex marked as locked
2883                  * statistics updated too
2884                  */
2885                 break;
2886         default:
2887                 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2888         }
2889
2890         /*
2891          * interlock is already acquired here
2892          */
2893
2894         /* mutex has been acquired */
2895         thread = (thread_t)lock->lck_mtx_owner;
2896         if (state & LCK_MTX_WAITERS_MSK) {
2897                 /*
2898                  * lck_mtx_lock_acquire_tail will call
2899                  * turnstile_complete.
2900                  */
2901                 return lck_mtx_lock_acquire_tail(lock, indirect, ts);
2902         }
2903
2904         if (ts != NULL) {
2905                 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2906         }
2907
2908         assert(current_thread()->turnstile != NULL);
2909
2910         /* release the interlock */
2911         lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
2912 }
2913
2914 /*
2915  * Helper noinline functions for calling
2916  * panic to optimize compiled code.
2917  */
2918
2919 __attribute__((noinline)) __abortlike
2920 static void
2921 lck_mtx_destroyed(
2922         lck_mtx_t       *lock)
2923 {
2924         panic("trying to interlock destroyed mutex (%p)", lock);
2925 }
2926
2927 __attribute__((noinline))
2928 static boolean_t
2929 lck_mtx_try_destroyed(
2930         lck_mtx_t       *lock)
2931 {
2932         panic("trying to interlock destroyed mutex (%p)", lock);
2933         return FALSE;
2934 }
2935
2936 __attribute__((always_inline))
2937 static boolean_t
2938 lck_mtx_lock_wait_interlock_to_clear(
2939         lck_mtx_t       *lock,
2940         uint32_t*        new_state)
2941 {
2942         uint32_t state;
2943
2944         for (;;) {
2945                 cpu_pause();
2946                 state = ordered_load_mtx_state(lock);
2947                 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2948                         *new_state = state;
2949                         return TRUE;
2950                 }
2951                 if (state & LCK_MTX_MLOCKED_MSK) {
2952                         /* if it is held as mutex, just fail */
2953                         return FALSE;
2954                 }
2955         }
2956 }
2957
2958 __attribute__((always_inline))
2959 static boolean_t
2960 lck_mtx_try_lock_wait_interlock_to_clear(
2961         lck_mtx_t       *lock,
2962         uint32_t*        new_state)
2963 {
2964         uint32_t state;
2965
2966         for (;;) {
2967                 cpu_pause();
2968                 state = ordered_load_mtx_state(lock);
2969                 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2970                         /* if it is held as mutex or spin, just fail */
2971                         return FALSE;
2972                 }
2973                 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2974                         *new_state = state;
2975                         return TRUE;
2976                 }
2977         }
2978 }
2979
2980 /*
2981  * Routine:     lck_mtx_lock_slow
2982  *
2983  * Locks a mutex for current thread.
2984  * If the lock is contended this function might
2985  * sleep.
2986  *
2987  * Called with interlock not held.
2988  */
2989 __attribute__((noinline))
2990 void
2991 lck_mtx_lock_slow(
2992         lck_mtx_t       *lock)
2993 {
2994         boolean_t       indirect = FALSE;
2995         uint32_t        state;
2996         int             first_miss = 0;
2997
2998         state = ordered_load_mtx_state(lock);
2999
3000         /* is the interlock or mutex held */
3001         if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3002                 /*
3003                  * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3004                  * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3005                  * set in state (state == lck_mtx_tag)
3006                  */
3007
3008
3009                 /* is the mutex already held and not indirect */
3010                 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3011                         /* no, must have been the mutex */
3012                         return lck_mtx_lock_contended(lock, indirect, &first_miss);
3013                 }
3014
3015                 /* check to see if it is marked destroyed */
3016                 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3017                         lck_mtx_destroyed(lock);
3018                 }
3019
3020                 /* Is this an indirect mutex? */
3021                 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3022                         indirect = get_indirect_mutex(&lock, &state);
3023
3024                         first_miss = 0;
3025                         lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3026
3027                         if (state & LCK_MTX_SPIN_MSK) {
3028                                 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
3029                                 assert(state & LCK_MTX_ILOCKED_MSK);
3030                                 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3031                         }
3032                 }
3033
3034                 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3035                         return lck_mtx_lock_contended(lock, indirect, &first_miss);
3036                 }
3037         }
3038
3039         /* no - can't be INDIRECT, DESTROYED or locked */
3040         while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3041                 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3042                         return lck_mtx_lock_contended(lock, indirect, &first_miss);
3043                 }
3044         }
3045
3046         /* lock and interlock acquired */
3047
3048         thread_t thread = current_thread();
3049         /* record owner of mutex */
3050         ordered_store_mtx_owner(lock, (uintptr_t)thread);
3051
3052 #if MACH_LDEBUG
3053         if (thread) {
3054                 thread->mutex_count++;          /* lock statistic */
3055         }
3056 #endif
3057         /*
3058          * Check if there are waiters to
3059          * inherit their priority.
3060          */
3061         if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3062                 return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
3063         }
3064
3065         /* release the interlock */
3066         lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
3067
3068         return;
3069 }
3070
3071 __attribute__((noinline))
3072 boolean_t
3073 lck_mtx_try_lock_slow(
3074         lck_mtx_t       *lock)
3075 {
3076         boolean_t       indirect = FALSE;
3077         uint32_t        state;
3078         int             first_miss = 0;
3079
3080         state = ordered_load_mtx_state(lock);
3081
3082         /* is the interlock or mutex held */
3083         if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3084                 /*
3085                  * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3086                  * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3087                  * set in state (state == lck_mtx_tag)
3088                  */
3089
3090                 /* is the mutex already held and not indirect */
3091                 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3092                         return FALSE;
3093                 }
3094
3095                 /* check to see if it is marked destroyed */
3096                 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3097                         lck_mtx_try_destroyed(lock);
3098                 }
3099
3100                 /* Is this an indirect mutex? */
3101                 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3102                         indirect = get_indirect_mutex(&lock, &state);
3103
3104                         first_miss = 0;
3105                         lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3106                 }
3107
3108                 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3109                         if (indirect) {
3110                                 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3111                         }
3112                         return FALSE;
3113                 }
3114         }
3115
3116         /* no - can't be INDIRECT, DESTROYED or locked */
3117         while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3118                 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3119                         if (indirect) {
3120                                 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3121                         }
3122                         return FALSE;
3123                 }
3124         }
3125
3126         /* lock and interlock acquired */
3127
3128         thread_t thread = current_thread();
3129         /* record owner of mutex */
3130         ordered_store_mtx_owner(lock, (uintptr_t)thread);
3131
3132 #if MACH_LDEBUG
3133         if (thread) {
3134                 thread->mutex_count++;          /* lock statistic */
3135         }
3136 #endif
3137         /*
3138          * Check if there are waiters to
3139          * inherit their priority.
3140          */
3141         if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3142                 return lck_mtx_try_lock_acquire_tail(lock);
3143         }
3144
3145         /* release the interlock */
3146         lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
3147
3148         return TRUE;
3149 }
3150
3151 __attribute__((noinline))
3152 void
3153 lck_mtx_lock_spin_slow(
3154         lck_mtx_t       *lock)
3155 {
3156         boolean_t       indirect = FALSE;
3157         uint32_t        state;
3158         int             first_miss = 0;
3159
3160         state = ordered_load_mtx_state(lock);
3161
3162         /* is the interlock or mutex held */
3163         if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3164                 /*
3165                  * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3166                  * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3167                  * set in state (state == lck_mtx_tag)
3168                  */
3169
3170
3171                 /* is the mutex already held and not indirect */
3172                 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3173                         /* no, must have been the mutex */
3174                         return lck_mtx_lock_contended(lock, indirect, &first_miss);
3175                 }
3176
3177                 /* check to see if it is marked destroyed */
3178                 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3179                         lck_mtx_destroyed(lock);
3180                 }
3181
3182                 /* Is this an indirect mutex? */
3183                 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3184                         indirect = get_indirect_mutex(&lock, &state);
3185
3186                         first_miss = 0;
3187                         lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3188
3189                         if (state & LCK_MTX_SPIN_MSK) {
3190                                 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
3191                                 assert(state & LCK_MTX_ILOCKED_MSK);
3192                                 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3193                         }
3194                 }
3195
3196                 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3197                         return lck_mtx_lock_contended(lock, indirect, &first_miss);
3198                 }
3199         }
3200
3201         /* no - can't be INDIRECT, DESTROYED or locked */
3202         while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3203                 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3204                         return lck_mtx_lock_contended(lock, indirect, &first_miss);
3205                 }
3206         }
3207
3208         /* lock as spinlock and interlock acquired */
3209
3210         thread_t thread = current_thread();
3211         /* record owner of mutex */
3212         ordered_store_mtx_owner(lock, (uintptr_t)thread);
3213
3214 #if MACH_LDEBUG
3215         if (thread) {
3216                 thread->mutex_count++;          /* lock statistic */
3217         }
3218 #endif
3219
3220 #if     CONFIG_DTRACE
3221         LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3222 #endif
3223         /* return with the interlock held and preemption disabled */
3224         return;
3225 }
3226
3227 __attribute__((noinline))
3228 boolean_t
3229 lck_mtx_try_lock_spin_slow(
3230         lck_mtx_t       *lock)
3231 {
3232         boolean_t       indirect = FALSE;
3233         uint32_t        state;
3234         int             first_miss = 0;
3235
3236         state = ordered_load_mtx_state(lock);
3237
3238         /* is the interlock or mutex held */
3239         if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3240                 /*
3241                  * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3242                  * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3243                  * set in state (state == lck_mtx_tag)
3244                  */
3245
3246                 /* is the mutex already held and not indirect */
3247                 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3248                         return FALSE;
3249                 }
3250
3251                 /* check to see if it is marked destroyed */
3252                 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3253                         lck_mtx_try_destroyed(lock);
3254                 }
3255
3256                 /* Is this an indirect mutex? */
3257                 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3258                         indirect = get_indirect_mutex(&lock, &state);
3259
3260                         first_miss = 0;
3261                         lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3262                 }
3263
3264                 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3265                         if (indirect) {
3266                                 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3267                         }
3268                         return FALSE;
3269                 }
3270         }
3271
3272         /* no - can't be INDIRECT, DESTROYED or locked */
3273         while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3274                 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3275                         if (indirect) {
3276                                 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3277                         }
3278                         return FALSE;
3279                 }
3280         }
3281
3282         /* lock and interlock acquired */
3283
3284         thread_t thread = current_thread();
3285         /* record owner of mutex */
3286         ordered_store_mtx_owner(lock, (uintptr_t)thread);
3287
3288 #if MACH_LDEBUG
3289         if (thread) {
3290                 thread->mutex_count++;          /* lock statistic */
3291         }
3292 #endif
3293
3294 #if     CONFIG_DTRACE
3295         LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3296 #endif
3297         return TRUE;
3298 }
3299
3300 __attribute__((noinline))
3301 void
3302 lck_mtx_convert_spin(
3303         lck_mtx_t       *lock)
3304 {
3305         uint32_t state;
3306
3307         state = ordered_load_mtx_state(lock);
3308
3309         /* Is this an indirect mutex? */
3310         if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3311                 /* If so, take indirection */
3312                 get_indirect_mutex(&lock, &state);
3313         }
3314
3315         assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3316
3317         if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3318                 /* already owned as a mutex, just return */
3319                 return;
3320         }
3321
3322         assert(get_preemption_level() > 0);
3323         assert(state & LCK_MTX_ILOCKED_MSK);
3324         assert(state & LCK_MTX_SPIN_MSK);
3325
3326         /*
3327          * Check if there are waiters to
3328          * inherit their priority.
3329          */
3330         if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3331                 return lck_mtx_convert_spin_acquire_tail(lock);
3332         }
3333
3334         lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3335
3336         return;
3337 }
3338
3339 static inline boolean_t
3340 lck_mtx_lock_grab_mutex(
3341         lck_mtx_t       *lock)
3342 {
3343         uint32_t state;
3344
3345         state = ordered_load_mtx_state(lock);
3346
3347         if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3348                 return FALSE;
3349         }
3350
3351         /* lock and interlock acquired */
3352
3353         thread_t thread = current_thread();
3354         /* record owner of mutex */
3355         ordered_store_mtx_owner(lock, (uintptr_t)thread);
3356
3357 #if MACH_LDEBUG
3358         if (thread) {
3359                 thread->mutex_count++;          /* lock statistic */
3360         }
3361 #endif
3362         return TRUE;
3363 }
3364
3365 __attribute__((noinline))
3366 void
3367 lck_mtx_assert(
3368         lck_mtx_t       *lock,
3369         unsigned int    type)
3370 {
3371         /* Panic unless the calling thread's ownership of lock matches type. */
3372         thread_t self = current_thread();
3373         uint32_t bits = ordered_load_mtx_state(lock);
3374
3375         /* Indirect tag: get_indirect_mutex() is handed both lock and bits. */
3376         if (bits == LCK_MTX_TAG_INDIRECT) {
3377                 get_indirect_mutex(&lock, &bits);
3378         }
3379         thread_t holder = (thread_t)lock->lck_mtx_owner;
3380
3381         if (type != LCK_MTX_ASSERT_OWNED) {
3382                 assert(type == LCK_MTX_ASSERT_NOTOWNED);
3383                 if (holder == self) {
3384                         panic("mutex (%p) owned\n", lock);
3385                 }
3386                 return;
3387         }
3388
3389         /* OWNED: we must be the recorded owner and a lock bit must be set. */
3390         if (holder != self || (bits & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)) == 0) {
3391                 panic("mutex (%p) not owned\n", lock);
3392         }
3393 }
3394
3395 /*
3396  * Routine:     lck_mtx_lock_spinwait_x86
3397  *
3398  * Invoked trying to acquire a mutex when there is contention. We keep
3399  * retrying while the owner is seen running on some processor, bounded by
3400  * a learning window, an adjusted (sliding) deadline and a hard deadline.
3401  *
3402  * Called with the interlock unlocked.
3403  * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3404  * returns LCK_MTX_SPINWAIT_NO_SPIN if the owner was off core on the first pass
3405  * returns one of the LCK_MTX_SPINWAIT_SPUN_* codes naming why we stopped otherwise
3406  */
3407 __attribute__((noinline))
3408 lck_mtx_spinwait_ret_type_t
3409 lck_mtx_lock_spinwait_x86(
3410         lck_mtx_t       *mutex)
3411 {
3412         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3413         thread_t        owner, prev_owner;
3414         uint64_t        window_deadline, sliding_deadline, high_deadline;
3415         uint64_t        start_time, cur_time, avg_hold_time, bias, delta;
3416         lck_mtx_spinwait_ret_type_t             retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3417         int             loopcount = 0;
3418         int             total_hold_time_samples, window_hold_time_samples, unfairness;
3419         uint            i, prev_owner_cpu;
3420         bool            owner_on_core, adjust;
3421
3422         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3423             trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3424
3425         start_time = mach_absolute_time();
3426         /*
3427          * window_deadline represents the "learning" phase.
3428          * The thread collects statistics about the lock until
3429          * window_deadline and then it decides whether to spin more
3430          * or stop, according to the concurrency behavior
3431          * observed.
3432          *
3433          * Every thread can spin at least low_MutexSpin.
3434          */
3435         window_deadline = start_time + low_MutexSpin;
3436         /*
3437          * Sliding_deadline is the adjusted spin deadline
3438          * computed after the "learning" phase.
3439          */
3440         sliding_deadline = window_deadline;
3441         /*
3442          * High_deadline is a hard deadline. No thread
3443          * can spin more than this deadline.
3444          */
3445         if (high_MutexSpin >= 0) {
3446                 high_deadline = start_time + high_MutexSpin;
3447         } else {
3448                 high_deadline = start_time + low_MutexSpin * real_ncpus;
3449         }
3450
3451         /*
3452          * We do not know yet which cpu the owner runs on.
3453          * Start the first scan from the cpu after ours.
3454          */
3455         prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
3456         total_hold_time_samples = 0;
3457         window_hold_time_samples = 0;
3458         avg_hold_time = 0;      /* NOTE(review): assigned here but never read below */
3459         adjust = TRUE;
3460         bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;
3461
3462         prev_owner = (thread_t) mutex->lck_mtx_owner;
3463         /*
3464          * Spin while:
3465          *   - mutex is locked, and
3466          *   - it's locked as a spin lock, and
3467          *   - owner is running on another processor, and
3468          *   - we haven't spun for long enough.
3469          */
3470         do {
3471                 /*
3472                  * Try to acquire the lock.
3473                  */
3474                 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3475                         retval = LCK_MTX_SPINWAIT_ACQUIRED;
3476                         break;
3477                 }
3478
3479                 cur_time = mach_absolute_time();
3480
3481                 /*
3482                  * Never spin past high_deadline.
3483                  */
3484                 if (cur_time >= high_deadline) {
3485                         retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3486                         break;
3487                 }
3488
3489                 /*
3490                  * Check if owner is on core. If not, stop spinning.
3491                  */
3492                 owner = (thread_t) mutex->lck_mtx_owner;
3493                 if (owner) {
3494                         i = prev_owner_cpu;
3495                         owner_on_core = FALSE;
3496
3497                         disable_preemption();
3498                         owner = (thread_t) mutex->lck_mtx_owner;        /* re-read with preemption disabled */
3499
3500                         /*
3501                          * For scalability we want to check if the owner is on core
3502                          * without locking the mutex interlock.
3503                          * If we do not lock the mutex interlock, the owner that we see might be
3504                          * invalid, so we cannot dereference it. Therefore we cannot check
3505                          * any field of the thread to tell us if it is on core.
3506                          * Instead, compare the owner against the active thread of each cpu.
3507                          */
3508                         if (owner) {
3509                                 do {
3510                                         if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner)) {
3511                                                 owner_on_core = TRUE;
3512                                                 break;
3513                                         }
3514                                         if (++i >= real_ncpus) {
3515                                                 i = 0;
3516                                         }
3517                                 } while (i != prev_owner_cpu);
3518                                 enable_preemption();
3519
3520                                 if (owner_on_core) {
3521                                         prev_owner_cpu = i;     /* next scan starts where the owner was found */
3522                                 } else {
3523                                         prev_owner = owner;
3524                                         owner = (thread_t) mutex->lck_mtx_owner;
3525                                         if (owner == prev_owner) {
3526                                                 /*
3527                                                  * Owner is unchanged and not on core.
3528                                                  * Stop spinning.
3529                                                  */
3530                                                 if (loopcount == 0) {
3531                                                         retval = LCK_MTX_SPINWAIT_NO_SPIN;
3532                                                 } else {
3533                                                         retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
3534                                                 }
3535                                                 break;
3536                                         }
3537                                         /*
3538                                          * Fall through if the owner changed while we were scanning.
3539                                          * The new owner could potentially be on core, so loop
3540                                          * again.
3541                                          */
3542                                 }
3543                         } else {
3544                                 enable_preemption();
3545                         }
3546                 }
3547
3548                 /*
3549                  * Save how many times we see the owner changing.
3550                  * We can roughly estimate the mutex hold
3551                  * time and the fairness with that.
3552                  */
3553                 if (owner != prev_owner) {
3554                         prev_owner = owner;
3555                         total_hold_time_samples++;
3556                         window_hold_time_samples++;
3557                 }
3558
3559                 /*
3560                  * Learning window expired.
3561                  * Try to adjust the sliding_deadline.
3562                  */
3563                 if (cur_time >= window_deadline) {
3564                         /*
3565                          * If the owner never changed during this window,
3566                          * stop spinning.
3567                          */
3568                         if (window_hold_time_samples < 1) {
3569                                 retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
3570                                 break;
3571                         }
3572
3573                         if (adjust) {
3574                                 /*
3575                                  * For a fair lock, we'd wait for at most (NCPU-1) periods,
3576                                  * but the lock is unfair, so let's try to estimate by how much.
3577                                  */
3578                                 unfairness = total_hold_time_samples / real_ncpus;
3579
3580                                 if (unfairness == 0) {
3581                                         /*
3582                                          * We observed the owner changing `total_hold_time_samples` times which
3583                                          * let us estimate the average hold time of this mutex for the duration
3584                                          * of the spin time.
3585                                          * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
3586                                          *
3587                                          * In this case spin at max avg_hold_time * (real_ncpus - 1)
3588                                          */
3589                                         delta = cur_time - start_time;
3590                                         sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
3591                                 } else {
3592                                         /*
3593                                          * In this case at least one of the other cpus was able to get the lock twice
3594                                          * while I was spinning.
3595                                          * We could spin longer but it won't necessarily help if the system is unfair.
3596                                          * Try to randomize the wait to reduce contention.
3597                                          *
3598                                          * We compute how much time we could potentially spin
3599                                          * and distribute it over the cpus.
3600                                          *
3601                                          * bias is an integer in [0, real_ncpus).
3602                                          * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
3603                                          */
3604                                         delta = high_deadline - cur_time;
3605                                         sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
3606                                         adjust = FALSE; /* no further adjustment once this branch is taken */
3607                                 }
3608                         }
3609
3610                         window_deadline += low_MutexSpin;
3611                         window_hold_time_samples = 0;
3612                 }
3613
3614                 /*
3615                  * Stop spinning if we are past
3616                  * the adjusted deadline.
3617                  */
3618                 if (cur_time >= sliding_deadline) {
3619                         retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
3620                         break;
3621                 }
3622
3623                 if ((thread_t) mutex->lck_mtx_owner != NULL) {
3624                         cpu_pause();    /* pause only while an owner is still recorded */
3625                 }
3626
3627                 loopcount++;
3628         } while (TRUE);
3629
3630 #if     CONFIG_DTRACE
3631         /*
3632          * Note that we record a different probe id depending on whether
3633          * this is a direct or indirect mutex.  This allows us to
3634          * penalize only lock groups that have debug/stats enabled
3635          * with dtrace processing if desired.
3636          */
3637         if (__probable(mutex->lck_mtx_is_ext == 0)) {
3638                 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3639                     mach_absolute_time() - start_time);
3640         } else {
3641                 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3642                     mach_absolute_time() - start_time);
3643         }
3644         /* The lockstat acquire event is recorded by the assembly code beneath us. */
3645 #endif
3646
3647         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3648             trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3649
3650         return retval;
3651 }
3652
3653
3654
3655 /*
3656  * Routine:     lck_mtx_lock_wait_x86
3657  *
3658  * Invoked in order to wait on contention.
3659  *
3660  * Called with the interlock locked and
3661  * preemption disabled...
3662  * returns it unlocked and with preemption enabled
3663  *
3664  * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3665  *      A runnable waiter can exist between wait and acquire
3666  *      without a waiters count being set.
3667  *      This allows us to never make a spurious wakeup call.
3668  *
3669  * Priority:
3670  *      This avoids taking the thread lock if the owning thread is the same priority.
3671  *      This optimizes the case of same-priority threads contending on a lock.
3672  *      However, that allows the owning thread to drop in priority while holding the lock,
3673  *      because there is no state that the priority change can notice that
3674  *      says that the targeted thread holds a contended mutex.
3675  *
3676  *      One possible solution: priority changes could look for some atomic tag
3677  *      on the thread saying 'holding contended lock', and then set up a promotion.
3678  *      Needs a story for dropping that promotion - the last contended unlock
3679  *      has to notice that this has happened.
3680  */
3681 __attribute__((noinline))
3682 void
3683 lck_mtx_lock_wait_x86(
3684         lck_mtx_t       *mutex,
3685         struct turnstile **ts)
3686 {
3687         thread_t self = current_thread();
3688
3689 #if     CONFIG_DTRACE
3690         uint64_t sleep_start = 0;       /* stays 0 unless a block probe is enabled */
3691
3692         if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3693                 sleep_start = mach_absolute_time();
3694         }
3695 #endif
3696         __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3697
3698         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3699             trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3700             mutex->lck_mtx_waiters, 0, 0);
3701
3702         assert(self->waiting_for_mutex == NULL);
3703         self->waiting_for_mutex = mutex;
3704         mutex->lck_mtx_waiters++;       /* one more wakeup will be needed */
3705
3706         thread_t holder = (thread_t)mutex->lck_mtx_owner;
3707         assert(holder != NULL);
3708
3709         /*
3710          * lck_mtx_lock_wait_x86 might be called in a loop. Call prepare just once and reuse
3711          * the same turnstile while looping; the matching turnstile complete will be called
3712          * by lck_mtx_lock_contended when finally acquiring the lock.
3713          */
3714         if (*ts == NULL) {
3715                 *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
3716         }
3717
3718         struct turnstile *turnstile = *ts;
3719         thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3720         turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
3721
3722         waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
3723
3724         lck_mtx_ilk_unlock(mutex);      /* the wait is asserted; drop the interlock before blocking */
3725
3726         turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
3727
3728         thread_block(THREAD_CONTINUE_NULL);
3729
3730         self->waiting_for_mutex = NULL; /* back from thread_block(): no longer waiting */
3731
3732         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3733             trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3734             mutex->lck_mtx_waiters, 0, 0);
3735
3736 #if     CONFIG_DTRACE
3737         /*
3738          * Record the Dtrace lockstat probe for blocking, block time
3739          * measured from when we were entered.
3740          */
3741         if (sleep_start) {
3742                 if (mutex->lck_mtx_is_ext == 0) {
3743                         LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3744                             mach_absolute_time() - sleep_start);
3745                 } else {
3746                         LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3747                             mach_absolute_time() - sleep_start);
3748                 }
3749         }
3750 #endif
3751 }
3752
3753 /*
3754  *      Routine: kdp_lck_mtx_lock_spin_is_acquired
3755  *      NOT SAFE: reads the lock bits without any synchronization; for use
3756  *      by the kernel debugger only, to avoid deadlock. Panics otherwise.
3757  *      Returns: TRUE if either the ilocked or the mlocked bit is set.
3758  */
3759 boolean_t
3760 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t     *lck)
3761 {
3762         boolean_t held;
3763
3764         if (not_in_kdp) {
3765                 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3766         }
3767
3768         held = (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) ? TRUE : FALSE;
3769         return held;
3770 }
3771
3772 void
3773 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3774 {
3775         /* Map the wait event back to its mutex, then fill in context and owner tid. */
3776         lck_mtx_t *mtx = LCK_EVENT_TO_MUTEX(event);
3777         waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mtx);
3778         waitinfo->owner = thread_tid((thread_t)mtx->lck_mtx_owner);
3779 }
3780
3781 void
3782 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3783 {
3784         lck_rw_t *lock = NULL;
3785
3786         /* The event-to-lock decoding depends on the kind of wait recorded. */
3787         if (waitinfo->wait_type == kThreadWaitKernelRWLockRead) {
3788                 lock = READ_EVENT_TO_RWLOCK(event);
3789         } else if (waitinfo->wait_type == kThreadWaitKernelRWLockWrite ||
3790             waitinfo->wait_type == kThreadWaitKernelRWLockUpgrade) {
3791                 lock = WRITE_EVENT_TO_RWLOCK(event);
3792         } else {
3793                 panic("%s was called with an invalid blocking type", __FUNCTION__);
3794         }
3795
3796         /* Report which lock is waited on; the owner field is always set to 0. */
3797         waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(lock);
3798         waitinfo->owner = 0;
3799 }