apple/xnu.git: osfmk/i386/locks_i386.c (blob at commit 5f693ff515312a7cfad3019412468b2de8082e63)
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #define ATOMIC_PRIVATE 1
65 #define LOCK_PRIVATE 1
66
67 #include <mach_ldebug.h>
68
69 #include <kern/lock_stat.h>
70 #include <kern/locks.h>
71 #include <kern/kalloc.h>
72 #include <kern/misc_protos.h>
73 #include <kern/thread.h>
74 #include <kern/processor.h>
75 #include <kern/cpu_data.h>
76 #include <kern/cpu_number.h>
77 #include <kern/sched_prim.h>
78 #include <kern/xpr.h>
79 #include <kern/debug.h>
80 #include <string.h>
81
82 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
83 #include <machine/atomic.h>
84 #include <machine/machine_cpu.h>
85 #include <i386/mp.h>
87 #include <sys/kdebug.h>
88 #include <i386/locks_i386_inlines.h>
89
90 #if CONFIG_DTRACE
91 #define DTRACE_RW_SHARED 0x0 //reader
92 #define DTRACE_RW_EXCL 0x1 //writer
93 #define DTRACE_NO_FLAG 0x0 //not applicable
94 #endif /* CONFIG_DTRACE */
95
96 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
97 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
98 #define LCK_RW_LCK_SHARED_CODE 0x102
99 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
100 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
101 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
102
103 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
104 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
105 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
106 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
107 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
108 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
109 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
110 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
111
112
113 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
114
115 unsigned int LcksOpts = 0;
116
117 #if DEVELOPMENT || DEBUG
118 unsigned int LckDisablePreemptCheck = 0;
119 #endif
120
121 /* Forwards */
122
123 #if USLOCK_DEBUG
124 /*
125 * Perform simple lock checks.
126 */
127 int uslock_check = 1;
128 int max_lock_loops = 100000000;
129 decl_simple_lock_data(extern, printf_lock)
130 decl_simple_lock_data(extern, panic_lock)
131 #endif /* USLOCK_DEBUG */
132
133 extern unsigned int not_in_kdp;
134
135 /*
136 * We often want to know the addresses of the callers
137 * of the various lock routines. However, this information
138 * is only used for debugging and statistics.
139 */
140 typedef void *pc_t;
141 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
142 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
143 #if ANY_LOCK_DEBUG
144 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
145 #define DECL_PC(pc) pc_t pc;
146 #else /* ANY_LOCK_DEBUG */
147 #define DECL_PC(pc)
148 #ifdef lint
149 /*
150 * Eliminate lint complaints about unused local pc variables.
151 */
152 #define OBTAIN_PC(pc) ++pc
153 #else /* lint */
154 #define OBTAIN_PC(pc)
155 #endif /* lint */
156 #endif /* ANY_LOCK_DEBUG */
157
158 /*
159 * atomic exchange API is a low level abstraction of the operations
160 * to atomically read, modify, and write a pointer. This abstraction works
161 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
162 * well as the ARM exclusive instructions.
163 *
164 * atomic_exchange_begin() - begin exchange and retrieve current value
165 * atomic_exchange_complete() - conclude an exchange
166 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
167 */
168 static uint32_t
169 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
170 {
171 uint32_t val;
172
173 (void)ord; // Memory order not used
174 val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
175 *previous = val;
176 return val;
177 }
178
179 static boolean_t
180 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
181 {
182 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
183 }
184
185 static void
186 atomic_exchange_abort(void)
187 {
188 }
189
190 static boolean_t
191 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
192 {
193 uint32_t value, prev;
194
195 for (;;) {
196 value = atomic_exchange_begin32(target, &prev, ord);
197 if (value & test_mask) {
198 if (wait) {
199 cpu_pause();
200 } else {
201 atomic_exchange_abort();
202 }
203 return FALSE;
204 }
205 value |= set_mask;
206 if (atomic_exchange_complete32(target, prev, value, ord)) {
207 return TRUE;
208 }
209 }
210 }
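
/*
 * Illustrative sketch (not part of this file's build): a hypothetical helper
 * showing how a caller uses the begin/complete/abort triple described above.
 * Read the current value, decide whether to proceed, abort if there is
 * nothing to do, and retry only when the compare-and-exchange loses a race.
 */
#if 0 /* example only */
static boolean_t
atomic_clear_bits32_example(uint32_t *target, uint32_t clear_mask, enum memory_order ord)
{
	uint32_t value, prev;

	for (;;) {
		value = atomic_exchange_begin32(target, &prev, ord);
		if ((value & clear_mask) == 0) {
			atomic_exchange_abort();	/* nothing to clear, give up the exchange */
			return FALSE;
		}
		value &= ~clear_mask;
		if (atomic_exchange_complete32(target, prev, value, ord)) {
			return TRUE;			/* exchange committed */
		}
		cpu_pause();				/* lost the race, re-read and retry */
	}
}
#endif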
211
212 /*
213 * Portable lock package implementation of usimple_locks.
214 */
215
216 #if USLOCK_DEBUG
217 #define USLDBG(stmt) stmt
218 void usld_lock_init(usimple_lock_t, unsigned short);
219 void usld_lock_pre(usimple_lock_t, pc_t);
220 void usld_lock_post(usimple_lock_t, pc_t);
221 void usld_unlock(usimple_lock_t, pc_t);
222 void usld_lock_try_pre(usimple_lock_t, pc_t);
223 void usld_lock_try_post(usimple_lock_t, pc_t);
224 int usld_lock_common_checks(usimple_lock_t, char *);
225 #else /* USLOCK_DEBUG */
226 #define USLDBG(stmt)
227 #endif /* USLOCK_DEBUG */
228
229 /*
230 * Forward definitions
231 */
232
233 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
234 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
235 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
236 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
237 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
238 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
239 void lck_rw_clear_promotions_x86(thread_t thread);
240 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
241 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
242 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
243 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect);
244 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
245 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
246 static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
247 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
248 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
249 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
250
251
252 /*
253 * Routine: lck_spin_alloc_init
254 */
255 lck_spin_t *
256 lck_spin_alloc_init(
257 lck_grp_t *grp,
258 lck_attr_t *attr)
259 {
260 lck_spin_t *lck;
261
262 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) {
263 lck_spin_init(lck, grp, attr);
264 }
265
266 return lck;
267 }
268
269 /*
270 * Routine: lck_spin_free
271 */
272 void
273 lck_spin_free(
274 lck_spin_t *lck,
275 lck_grp_t *grp)
276 {
277 lck_spin_destroy(lck, grp);
278 kfree(lck, sizeof(lck_spin_t));
279 }
280
281 /*
282 * Routine: lck_spin_init
283 */
284 void
285 lck_spin_init(
286 lck_spin_t *lck,
287 lck_grp_t *grp,
288 __unused lck_attr_t *attr)
289 {
290 usimple_lock_init((usimple_lock_t) lck, 0);
291 lck_grp_reference(grp);
292 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
293 }
294
295 /*
296 * Routine: lck_spin_destroy
297 */
298 void
299 lck_spin_destroy(
300 lck_spin_t *lck,
301 lck_grp_t *grp)
302 {
303 if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
304 return;
305 }
306 lck->interlock = LCK_SPIN_TAG_DESTROYED;
307 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
308 lck_grp_deallocate(grp);
309 return;
310 }
311
312 /*
313 * Routine: lck_spin_lock
314 */
315 void
316 lck_spin_lock_grp(
317 lck_spin_t *lck,
318 lck_grp_t *grp)
319 {
320 #pragma unused(grp)
321 usimple_lock((usimple_lock_t) lck, grp);
322 }
323
324 void
325 lck_spin_lock(
326 lck_spin_t *lck)
327 {
328 usimple_lock((usimple_lock_t) lck, NULL);
329 }
330
331 /*
332 * Routine: lck_spin_unlock
333 */
334 void
335 lck_spin_unlock(
336 lck_spin_t *lck)
337 {
338 usimple_unlock((usimple_lock_t) lck);
339 }
340
341 boolean_t
342 lck_spin_try_lock_grp(
343 lck_spin_t *lck,
344 lck_grp_t *grp)
345 {
346 #pragma unused(grp)
347 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
348 #if DEVELOPMENT || DEBUG
349 if (lrval) {
350 pltrace(FALSE);
351 }
352 #endif
353 return lrval;
354 }
355
356
357 /*
358 * Routine: lck_spin_try_lock
359 */
360 boolean_t
361 lck_spin_try_lock(
362 lck_spin_t *lck)
363 {
364 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
365 #if DEVELOPMENT || DEBUG
366 if (lrval) {
367 pltrace(FALSE);
368 }
369 #endif
370 return lrval;
371 }
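
/*
 * Illustrative sketch (not part of this file's build): a hypothetical caller
 * walking the spin-lock lifecycle exported above. The function and group
 * names are assumptions for the example.
 */
#if 0 /* example only */
static void
spinlock_usage_example(void)
{
	lck_grp_t  *grp  = lck_grp_alloc_init("example.spin", LCK_GRP_ATTR_NULL);
	lck_spin_t *lock = lck_spin_alloc_init(grp, LCK_ATTR_NULL);

	lck_spin_lock(lock);			/* returns with preemption disabled */
	/* ... short, non-blocking critical section ... */
	lck_spin_unlock(lock);

	if (lck_spin_try_lock(lock)) {		/* non-blocking attempt */
		lck_spin_unlock(lock);
	}

	lck_spin_free(lock, grp);
	lck_grp_free(grp);
}
#endif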
372
373 /*
374 * Routine: lck_spin_assert
375 */
376 void
377 lck_spin_assert(lck_spin_t *lock, unsigned int type)
378 {
379 thread_t thread, holder;
380 uintptr_t state;
381
382 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
383 panic("lck_spin_assert(): invalid arg (%u)", type);
384 }
385
386 state = lock->interlock;
387 holder = (thread_t)state;
388 thread = current_thread();
389 if (type == LCK_ASSERT_OWNED) {
390 if (__improbable(holder == THREAD_NULL)) {
391 panic("Lock not owned %p = %lx", lock, state);
392 }
393 if (__improbable(holder != thread)) {
394 panic("Lock not owned by current thread %p = %lx", lock, state);
395 }
396 } else if (type == LCK_ASSERT_NOTOWNED) {
397 if (__improbable(holder != THREAD_NULL)) {
398 if (holder == thread) {
399 panic("Lock owned by current thread %p = %lx", lock, state);
400 } else {
401 panic("Lock %p owned by thread %p", lock, holder);
402 }
403 }
404 }
405 }
406
407 /*
408 * Routine: kdp_lck_spin_is_acquired
409 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
410 * Returns: TRUE if lock is acquired.
411 */
412 boolean_t
413 kdp_lck_spin_is_acquired(lck_spin_t *lck)
414 {
415 if (not_in_kdp) {
416 panic("panic: spinlock acquired check done outside of kernel debugger");
417 }
418 return (lck->interlock != 0)? TRUE : FALSE;
419 }
420
421 /*
422 * Initialize a usimple_lock.
423 *
424 * No change in preemption state.
425 */
426 void
427 usimple_lock_init(
428 usimple_lock_t l,
429 __unused unsigned short tag)
430 {
431 #ifndef MACHINE_SIMPLE_LOCK
432 USLDBG(usld_lock_init(l, tag));
433 hw_lock_init(&l->interlock);
434 #else
435 simple_lock_init((simple_lock_t)l, tag);
436 #endif
437 }
438
439 volatile uint32_t spinlock_owner_cpu = ~0;
440 volatile usimple_lock_t spinlock_timed_out;
441
442 uint32_t
443 spinlock_timeout_NMI(uintptr_t thread_addr)
444 {
445 uint32_t i;
446
447 for (i = 0; i < real_ncpus; i++) {
448 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
449 spinlock_owner_cpu = i;
450 if ((uint32_t) cpu_number() != i) {
451 /* Cause NMI and panic on the owner's cpu */
452 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
453 }
454 break;
455 }
456 }
457
458 return spinlock_owner_cpu;
459 }
460
461 /*
462 * Acquire a usimple_lock.
463 *
464 * Returns with preemption disabled. Note
465 * that the hw_lock routines are responsible for
466 * maintaining preemption state.
467 */
468 void
469 (usimple_lock)(
470 usimple_lock_t l
471 LCK_GRP_ARG(lck_grp_t *grp))
472 {
473 #ifndef MACHINE_SIMPLE_LOCK
474 DECL_PC(pc);
475
476 OBTAIN_PC(pc);
477 USLDBG(usld_lock_pre(l, pc));
478
479 if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
480 boolean_t uslock_acquired = FALSE;
481 while (machine_timeout_suspended()) {
482 enable_preemption();
483 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
484 break;
485 }
486 }
487
488 if (uslock_acquired == FALSE) {
489 uint32_t lock_cpu;
490 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
491 spinlock_timed_out = l;
492 lock_cpu = spinlock_timeout_NMI(lowner);
493 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
494 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
495 }
496 }
497 #if DEVELOPMENT || DEBUG
498 pltrace(FALSE);
499 #endif
500
501 USLDBG(usld_lock_post(l, pc));
502 #else
503 simple_lock((simple_lock_t)l, grp);
504 #endif
505 #if CONFIG_DTRACE
506 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
507 #endif
508 }
509
510
511 /*
512 * Release a usimple_lock.
513 *
514 * Returns with preemption enabled. Note
515 * that the hw_lock routines are responsible for
516 * maintaining preemption state.
517 */
518 void
519 usimple_unlock(
520 usimple_lock_t l)
521 {
522 #ifndef MACHINE_SIMPLE_LOCK
523 DECL_PC(pc);
524
525 OBTAIN_PC(pc);
526 USLDBG(usld_unlock(l, pc));
527 #if DEVELOPMENT || DEBUG
528 pltrace(TRUE);
529 #endif
530 hw_lock_unlock(&l->interlock);
531 #else
532 simple_unlock_rwmb((simple_lock_t)l);
533 #endif
534 }
535
536
537 /*
538 * Conditionally acquire a usimple_lock.
539 *
540 * On success, returns with preemption disabled.
541 * On failure, returns with preemption in the same state
542 * as when first invoked. Note that the hw_lock routines
543 * are responsible for maintaining preemption state.
544 *
545 * XXX No stats are gathered on a miss; I preserved this
546 * behavior from the original assembly-language code, but
547 * doesn't it make sense to log misses? XXX
548 */
549 unsigned int
550 usimple_lock_try(
551 usimple_lock_t l,
552 lck_grp_t *grp)
553 {
554 #ifndef MACHINE_SIMPLE_LOCK
555 unsigned int success;
556 DECL_PC(pc);
557
558 OBTAIN_PC(pc);
559 USLDBG(usld_lock_try_pre(l, pc));
560 if ((success = hw_lock_try(&l->interlock, grp))) {
561 #if DEVELOPMENT || DEBUG
562 pltrace(FALSE);
563 #endif
564 USLDBG(usld_lock_try_post(l, pc));
565 }
566 return success;
567 #else
568 return simple_lock_try((simple_lock_t)l, grp);
569 #endif
570 }
571
572 /*
573 * Acquire a usimple_lock while polling for pending TLB flushes
574 * and spinning on a lock.
575 *
576 */
577 void
578 usimple_lock_try_lock_loop(usimple_lock_t l, lck_grp_t *grp)
579 {
580 boolean_t istate = ml_get_interrupts_enabled();
581 while (!simple_lock_try(l, grp)) {
582 if (!istate) {
583 handle_pending_TLB_flushes();
584 }
585 cpu_pause();
586 }
587 }
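
/*
 * Illustrative sketch (not part of this file's build): a hypothetical caller
 * pairing usimple_lock()/usimple_unlock() as described above. Acquire returns
 * with preemption disabled and release re-enables it, so the critical section
 * must stay short and must not block. Names here are assumptions.
 */
#if 0 /* example only */
decl_simple_lock_data(static, example_usl)

static void
usimple_lock_usage_example(lck_grp_t *grp)
{
	usimple_lock_init(&example_usl, 0);

	usimple_lock(&example_usl, grp);	/* spins; returns with preemption disabled */
	/* ... short, non-blocking critical section ... */
	usimple_unlock(&example_usl);		/* preemption re-enabled */

	if (usimple_lock_try(&example_usl, grp)) {
		usimple_unlock(&example_usl);
	}
	/* on a failed try, preemption is left in its original state */
}
#endif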
588
589 #if USLOCK_DEBUG
590 /*
591 * States of a usimple_lock. The default when initializing
592 * a usimple_lock is setting it up for debug checking.
593 */
594 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
595 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
596 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
597 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
598 #define USLOCK_CHECKING(l) (uslock_check && \
599 ((l)->debug.state & USLOCK_CHECKED))
600
601 /*
602 * Trace activities of a particularly interesting lock.
603 */
604 void usl_trace(usimple_lock_t, int, pc_t, const char *);
605
606
607 /*
608 * Initialize the debugging information contained
609 * in a usimple_lock.
610 */
611 void
612 usld_lock_init(
613 usimple_lock_t l,
614 __unused unsigned short tag)
615 {
616 if (l == USIMPLE_LOCK_NULL) {
617 panic("lock initialization: null lock pointer");
618 }
619 l->lock_type = USLOCK_TAG;
620 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
621 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
622 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
623 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
624 l->debug.duration[0] = l->debug.duration[1] = 0;
628 }
629
630
631 /*
632 * These checks apply to all usimple_locks, not just
633 * those with USLOCK_CHECKED turned on.
634 */
635 int
636 usld_lock_common_checks(
637 usimple_lock_t l,
638 char *caller)
639 {
640 if (l == USIMPLE_LOCK_NULL) {
641 panic("%s: null lock pointer", caller);
642 }
643 if (l->lock_type != USLOCK_TAG) {
644 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
645 }
646 if (!(l->debug.state & USLOCK_INIT)) {
647 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
648 }
649 return USLOCK_CHECKING(l);
650 }
651
652
653 /*
654 * Debug checks on a usimple_lock just before attempting
655 * to acquire it.
656 */
657 /* ARGSUSED */
658 void
659 usld_lock_pre(
660 usimple_lock_t l,
661 pc_t pc)
662 {
663 char caller[] = "usimple_lock";
664
665
666 if (!usld_lock_common_checks(l, caller)) {
667 return;
668 }
669
670 /*
671 * Note that we have a weird case where we are getting a lock when we are
672 * in the process of putting the system to sleep. We are running with no
673 * current threads, therefore we can't tell if we are trying to retake a lock
674 * we already hold or whether another processor holds it. Therefore we just
675 * ignore this test if the locking thread is 0.
676 */
677
678 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
679 l->debug.lock_thread == (void *) current_thread()) {
680 printf("%s: lock %p already locked (at %p) by",
681 caller, l, l->debug.lock_pc);
682 printf(" current thread %p (new attempt at pc %p)\n",
683 l->debug.lock_thread, pc);
684 panic("%s", caller);
685 }
686 mp_disable_preemption();
687 usl_trace(l, cpu_number(), pc, caller);
688 mp_enable_preemption();
689 }
690
691
692 /*
693 * Debug checks on a usimple_lock just after acquiring it.
694 *
695 * Pre-emption has been disabled at this point,
696 * so we are safe in using cpu_number.
697 */
698 void
699 usld_lock_post(
700 usimple_lock_t l,
701 pc_t pc)
702 {
703 int mycpu;
704 char caller[] = "successful usimple_lock";
705
706
707 if (!usld_lock_common_checks(l, caller)) {
708 return;
709 }
710
711 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
712 panic("%s: lock %p became uninitialized",
713 caller, l);
714 }
715 if ((l->debug.state & USLOCK_TAKEN)) {
716 panic("%s: lock 0x%p became TAKEN by someone else",
717 caller, l);
718 }
719
720 mycpu = cpu_number();
721 l->debug.lock_thread = (void *)current_thread();
722 l->debug.state |= USLOCK_TAKEN;
723 l->debug.lock_pc = pc;
724 l->debug.lock_cpu = mycpu;
725
726 usl_trace(l, mycpu, pc, caller);
727 }
728
729
730 /*
731 * Debug checks on a usimple_lock just before
732 * releasing it. Note that the caller has not
733 * yet released the hardware lock.
734 *
735 * Preemption is still disabled, so there's
736 * no problem using cpu_number.
737 */
738 void
739 usld_unlock(
740 usimple_lock_t l,
741 pc_t pc)
742 {
743 int mycpu;
744 char caller[] = "usimple_unlock";
745
746
747 if (!usld_lock_common_checks(l, caller)) {
748 return;
749 }
750
751 mycpu = cpu_number();
752
753 if (!(l->debug.state & USLOCK_TAKEN)) {
754 panic("%s: lock 0x%p hasn't been taken",
755 caller, l);
756 }
757 if (l->debug.lock_thread != (void *) current_thread()) {
758 panic("%s: unlocking lock 0x%p, owned by thread %p",
759 caller, l, l->debug.lock_thread);
760 }
761 if (l->debug.lock_cpu != mycpu) {
762 printf("%s: unlocking lock 0x%p on cpu 0x%x",
763 caller, l, mycpu);
764 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
765 panic("%s", caller);
766 }
767 usl_trace(l, mycpu, pc, caller);
768
769 l->debug.unlock_thread = l->debug.lock_thread;
770 l->debug.lock_thread = INVALID_THREAD;
771 l->debug.state &= ~USLOCK_TAKEN;
772 l->debug.unlock_pc = pc;
773 l->debug.unlock_cpu = mycpu;
774 }
775
776
777 /*
778 * Debug checks on a usimple_lock just before
779 * attempting to acquire it.
780 *
781 * Preemption isn't guaranteed to be disabled.
782 */
783 void
784 usld_lock_try_pre(
785 usimple_lock_t l,
786 pc_t pc)
787 {
788 char caller[] = "usimple_lock_try";
789
790 if (!usld_lock_common_checks(l, caller)) {
791 return;
792 }
793 mp_disable_preemption();
794 usl_trace(l, cpu_number(), pc, caller);
795 mp_enable_preemption();
796 }
797
798
799 /*
800 * Debug checks on a usimple_lock just after
801 * successfully attempting to acquire it.
802 *
803 * Preemption has been disabled by the
804 * lock acquisition attempt, so it's safe
805 * to use cpu_number.
806 */
807 void
808 usld_lock_try_post(
809 usimple_lock_t l,
810 pc_t pc)
811 {
812 int mycpu;
813 char caller[] = "successful usimple_lock_try";
814
815 if (!usld_lock_common_checks(l, caller)) {
816 return;
817 }
818
819 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
820 panic("%s: lock 0x%p became uninitialized",
821 caller, l);
822 }
823 if ((l->debug.state & USLOCK_TAKEN)) {
824 panic("%s: lock 0x%p became TAKEN by someone else",
825 caller, l);
826 }
827
828 mycpu = cpu_number();
829 l->debug.lock_thread = (void *) current_thread();
830 l->debug.state |= USLOCK_TAKEN;
831 l->debug.lock_pc = pc;
832 l->debug.lock_cpu = mycpu;
833
834 usl_trace(l, mycpu, pc, caller);
835 }
836
837
838 /*
839 * For very special cases, set traced_lock to point to a
840 * specific lock of interest. The result is a series of
841 * XPRs showing lock operations on that lock. The lock_seq
842 * value is used to show the order of those operations.
843 */
844 usimple_lock_t traced_lock;
845 unsigned int lock_seq;
846
847 void
848 usl_trace(
849 usimple_lock_t l,
850 int mycpu,
851 pc_t pc,
852 const char * op_name)
853 {
854 if (traced_lock == l) {
855 XPR(XPR_SLOCK,
856 "seq %d, cpu %d, %s @ %x\n",
857 (uintptr_t) lock_seq, (uintptr_t) mycpu,
858 (uintptr_t) op_name, (uintptr_t) pc, 0);
859 lock_seq++;
860 }
861 }
862
863
864 #endif /* USLOCK_DEBUG */
865
866 /*
867 * Routine: lck_rw_alloc_init
868 */
869 lck_rw_t *
870 lck_rw_alloc_init(
871 lck_grp_t *grp,
872 lck_attr_t *attr)
873 {
874 lck_rw_t *lck;
875
876 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
877 bzero(lck, sizeof(lck_rw_t));
878 lck_rw_init(lck, grp, attr);
879 }
880
881 return lck;
882 }
883
884 /*
885 * Routine: lck_rw_free
886 */
887 void
888 lck_rw_free(
889 lck_rw_t *lck,
890 lck_grp_t *grp)
891 {
892 lck_rw_destroy(lck, grp);
893 kfree(lck, sizeof(lck_rw_t));
894 }
895
896 /*
897 * Routine: lck_rw_init
898 */
899 void
900 lck_rw_init(
901 lck_rw_t *lck,
902 lck_grp_t *grp,
903 lck_attr_t *attr)
904 {
905 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
906 attr : &LockDefaultLckAttr;
907
908 hw_lock_byte_init(&lck->lck_rw_interlock);
909 lck->lck_rw_want_write = FALSE;
910 lck->lck_rw_want_upgrade = FALSE;
911 lck->lck_rw_shared_count = 0;
912 lck->lck_rw_can_sleep = TRUE;
913 lck->lck_r_waiting = lck->lck_w_waiting = 0;
914 lck->lck_rw_tag = 0;
915 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
916 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
917
918 lck_grp_reference(grp);
919 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
920 }
921
922 /*
923 * Routine: lck_rw_destroy
924 */
925 void
926 lck_rw_destroy(
927 lck_rw_t *lck,
928 lck_grp_t *grp)
929 {
930 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
931 return;
932 }
933 #if MACH_LDEBUG
934 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
935 #endif
936 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
937 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
938 lck_grp_deallocate(grp);
939 return;
940 }
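
/*
 * Illustrative sketch (not part of this file's build): a hypothetical caller
 * walking the rw-lock lifecycle: allocate against a group, take the lock
 * shared for readers and exclusive for writers, then free it. The group name
 * is an assumption for the example.
 */
#if 0 /* example only */
static void
rwlock_usage_example(void)
{
	lck_grp_t *grp  = lck_grp_alloc_init("example.rw", LCK_GRP_ATTR_NULL);
	lck_rw_t  *lock = lck_rw_alloc_init(grp, LCK_ATTR_NULL);

	lck_rw_lock_shared(lock);		/* many readers may hold this concurrently */
	/* ... read-only access ... */
	lck_rw_unlock_shared(lock);

	lck_rw_lock_exclusive(lock);		/* single writer, no readers */
	/* ... modify the protected state ... */
	lck_rw_unlock_exclusive(lock);

	lck_rw_free(lock, grp);
	lck_grp_free(grp);
}
#endif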
941
942 /*
943 * Sleep locks. These use the same data structure and algorithm
944 * as the spin locks, but the process sleeps while it is waiting
945 * for the lock. These work on uniprocessor systems.
946 */
947
948 #define DECREMENTER_TIMEOUT 1000000
949
950 /*
951 * We disable interrupts while holding the RW interlock to prevent an
952 * interrupt from exacerbating hold time.
953 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
954 */
955 static inline boolean_t
956 lck_interlock_lock(lck_rw_t *lck)
957 {
958 boolean_t istate;
959
960 istate = ml_set_interrupts_enabled(FALSE);
961 hw_lock_byte_lock(&lck->lck_rw_interlock);
962 return istate;
963 }
964
965 static inline void
966 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
967 {
968 hw_lock_byte_unlock(&lck->lck_rw_interlock);
969 ml_set_interrupts_enabled(istate);
970 }
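
/*
 * Illustrative sketch (not part of this file's build): the pattern the slow
 * paths below follow when mutating lock state behind the interlock. Take the
 * interlock with interrupts disabled, make the change, then drop the
 * interlock and restore the saved interrupt state.
 */
#if 0 /* example only */
static void
rw_set_writer_waiting_example(lck_rw_t *lck)
{
	boolean_t istate;

	istate = lck_interlock_lock(lck);	/* interrupts off, interlock held */
	lck->lck_w_waiting = TRUE;		/* state change observed atomically by other CPUs */
	lck_interlock_unlock(lck, istate);	/* interlock dropped, interrupts restored */
}
#endif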
971
972 /*
973 * This inline is used when busy-waiting for an rw lock.
974 * If interrupts were disabled when the lock primitive was called,
975 * we poll the IPI handler for pending tlb flushes.
976 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
977 */
978 static inline void
979 lck_rw_lock_pause(boolean_t interrupts_enabled)
980 {
981 if (!interrupts_enabled) {
982 handle_pending_TLB_flushes();
983 }
984 cpu_pause();
985 }
986
987 static inline boolean_t
988 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
989 {
990 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
991 return TRUE;
992 }
993 return FALSE;
994 }
995
996 /*
997 * compute the deadline to spin against when
998 * waiting for a change of state on a lck_rw_t
999 */
1000 static inline uint64_t
1001 lck_rw_deadline_for_spin(lck_rw_t *lck)
1002 {
1003 if (lck->lck_rw_can_sleep) {
1004 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
1005 /*
1006 * there are already threads waiting on this lock... this
1007 * implies that they have spun beyond their deadlines waiting for
1008 * the desired state to show up so we will not bother spinning at this time...
1009 * or
1010 * the current number of threads sharing this lock exceeds our capacity to run them
1011 * concurrently and since all states we're going to spin for require the rw_shared_count
1012 * to be at 0, we'll not bother spinning since the latency for this to happen is
1013 * unpredictable...
1014 */
1015 return mach_absolute_time();
1016 }
1017 return mach_absolute_time() + MutexSpin;
1018 } else {
1019 return mach_absolute_time() + (1LL * 1000000000LL);
1020 }
1021 }
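
/*
 * Illustrative sketch (not part of this file's build): how the slow paths
 * below use the deadline. Spin (polling for TLB flushes if interrupts are
 * off) until the desired state shows up or the deadline expires, and only
 * then consider blocking.
 */
#if 0 /* example only */
static boolean_t
rw_spin_until_unheld_example(lck_rw_t *lck, boolean_t istate)
{
	uint64_t deadline = lck_rw_deadline_for_spin(lck);

	while (lck_rw_held_read_or_upgrade(lck)) {
		if (mach_absolute_time() >= deadline) {
			return FALSE;		/* give up spinning; caller may block */
		}
		lck_rw_lock_pause(istate);	/* cpu_pause() plus TLB-flush polling */
	}
	return TRUE;				/* desired state showed up while spinning */
}
#endif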
1022
1023
1024 /*
1025 * Spin while interlock is held.
1026 */
1027
1028 static inline void
1029 lck_rw_interlock_spin(lck_rw_t *lock)
1030 {
1031 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1032 cpu_pause();
1033 }
1034 }
1035
1036 static boolean_t
1037 lck_rw_grab_want(lck_rw_t *lock)
1038 {
1039 uint32_t data, prev;
1040
1041 for (;;) {
1042 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
1043 if ((data & LCK_RW_INTERLOCK) == 0) {
1044 break;
1045 }
1046 atomic_exchange_abort();
1047 lck_rw_interlock_spin(lock);
1048 }
1049 if (data & LCK_RW_WANT_WRITE) {
1050 atomic_exchange_abort();
1051 return FALSE;
1052 }
1053 data |= LCK_RW_WANT_WRITE;
1054 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1055 }
1056
1057 static boolean_t
1058 lck_rw_grab_shared(lck_rw_t *lock)
1059 {
1060 uint32_t data, prev;
1061
1062 for (;;) {
1063 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1064 if ((data & LCK_RW_INTERLOCK) == 0) {
1065 break;
1066 }
1067 atomic_exchange_abort();
1068 lck_rw_interlock_spin(lock);
1069 }
1070 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1071 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1072 atomic_exchange_abort();
1073 return FALSE;
1074 }
1075 }
1076 data += LCK_RW_SHARED_READER;
1077 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1078 }
1079
1080 /*
1081 * Routine: lck_rw_lock_exclusive
1082 */
1083 static void
1084 lck_rw_lock_exclusive_gen(
1085 lck_rw_t *lck)
1086 {
1087 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1088 uint64_t deadline = 0;
1089 int slept = 0;
1090 int gotlock = 0;
1091 int lockheld = 0;
1092 wait_result_t res = 0;
1093 boolean_t istate = -1;
1094
1095 #if CONFIG_DTRACE
1096 boolean_t dtrace_ls_initialized = FALSE;
1097 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1098 uint64_t wait_interval = 0;
1099 int readers_at_sleep = 0;
1100 #endif
1101
1102 /*
1103 * Try to acquire the lck_rw_want_write bit.
1104 */
1105 while (!lck_rw_grab_want(lck)) {
1106 #if CONFIG_DTRACE
1107 if (dtrace_ls_initialized == FALSE) {
1108 dtrace_ls_initialized = TRUE;
1109 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1110 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1111 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1112 if (dtrace_ls_enabled) {
1113 /*
1114 * Either sleeping or spinning is happening,
1115 * start a timing of our delay interval now.
1116 */
1117 readers_at_sleep = lck->lck_rw_shared_count;
1118 wait_interval = mach_absolute_time();
1119 }
1120 }
1121 #endif
1122 if (istate == -1) {
1123 istate = ml_get_interrupts_enabled();
1124 }
1125
1126 deadline = lck_rw_deadline_for_spin(lck);
1127
1128 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1129
1130 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
1131 lck_rw_lock_pause(istate);
1132 }
1133
1134 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1135
1136 if (gotlock) {
1137 break;
1138 }
1139 /*
1140 * if we get here, the deadline has expired w/o us
1141 * being able to grab the lock exclusively
1142 * check to see if we're allowed to do a thread_block
1143 */
1144 if (lck->lck_rw_can_sleep) {
1145 istate = lck_interlock_lock(lck);
1146
1147 if (lck->lck_rw_want_write) {
1148 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1149
1150 lck->lck_w_waiting = TRUE;
1151
1152 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1153 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1154 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1155 lck_interlock_unlock(lck, istate);
1156
1157 if (res == THREAD_WAITING) {
1158 res = thread_block(THREAD_CONTINUE_NULL);
1159 slept++;
1160 }
1161 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1162 } else {
1163 lck->lck_rw_want_write = TRUE;
1164 lck_interlock_unlock(lck, istate);
1165 break;
1166 }
1167 }
1168 }
1169 /*
1170 * Wait for readers (and upgrades) to finish...
1171 * the test for these conditions must be done simultaneously with
1172 * a check of the interlock not being held since
1173 * the rw_shared_count will drop to 0 first and then want_upgrade
1174 * will be set to 1 in the shared_to_exclusive scenario... those
1175 * adjustments are done behind the interlock and represent an
1176 * atomic change in state and must be considered as such...
1177 * however, once we see the read count at 0, the want_upgrade not set
1178 * and the interlock not held, we are safe to proceed
1179 */
1180 while (lck_rw_held_read_or_upgrade(lck)) {
1181 #if CONFIG_DTRACE
1182 /*
1183 * Either sleeping or spinning is happening, start
1184 * a timing of our delay interval now. If we set it
1185 * to -1 we don't have accurate data so we cannot later
1186 * decide to record a dtrace spin or sleep event.
1187 */
1188 if (dtrace_ls_initialized == FALSE) {
1189 dtrace_ls_initialized = TRUE;
1190 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1191 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1192 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1193 if (dtrace_ls_enabled) {
1194 /*
1195 * Either sleeping or spinning is happening,
1196 * start a timing of our delay interval now.
1197 */
1198 readers_at_sleep = lck->lck_rw_shared_count;
1199 wait_interval = mach_absolute_time();
1200 }
1201 }
1202 #endif
1203 if (istate == -1) {
1204 istate = ml_get_interrupts_enabled();
1205 }
1206
1207 deadline = lck_rw_deadline_for_spin(lck);
1208
1209 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1210
1211 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
1212 lck_rw_lock_pause(istate);
1213 }
1214
1215 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1216
1217 if (!lockheld) {
1218 break;
1219 }
1220 /*
1221 * if we get here, the deadline has expired w/o us
1222 * being able to grab the lock exclusively
1223 * check to see if we're allowed to do a thread_block
1224 */
1225 if (lck->lck_rw_can_sleep) {
1226 istate = lck_interlock_lock(lck);
1227
1228 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1229 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1230
1231 lck->lck_w_waiting = TRUE;
1232
1233 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1234 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1235 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1236 lck_interlock_unlock(lck, istate);
1237
1238 if (res == THREAD_WAITING) {
1239 res = thread_block(THREAD_CONTINUE_NULL);
1240 slept++;
1241 }
1242 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1243 } else {
1244 lck_interlock_unlock(lck, istate);
1245 /*
1246 * must own the lock now, since we checked for
1247 * readers or upgrade owner behind the interlock
1248 * no need for a call to 'lck_rw_held_read_or_upgrade'
1249 */
1250 break;
1251 }
1252 }
1253 }
1254
1255 #if CONFIG_DTRACE
1256 /*
1257 * Decide what latencies we suffered that are Dtrace events.
1258 * If we have set wait_interval, then we either spun or slept.
1259 * At least we get out from under the interlock before we record
1260 * which is the best we can do here to minimize the impact
1261 * of the tracing.
1262 * If we have set wait_interval to -1, then dtrace was not enabled when we
1263 * started sleeping/spinning so we don't record this event.
1264 */
1265 if (dtrace_ls_enabled == TRUE) {
1266 if (slept == 0) {
1267 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1268 mach_absolute_time() - wait_interval, 1);
1269 } else {
1270 /*
1271 * For the blocking case, we also record if when we blocked
1272 * it was held for read or write, and how many readers.
1273 * Notice that above we recorded this before we dropped
1274 * the interlock so the count is accurate.
1275 */
1276 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1277 mach_absolute_time() - wait_interval, 1,
1278 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1279 }
1280 }
1281 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1282 #endif
1283 }
1284
1285 /*
1286 * Routine: lck_rw_done
1287 */
1288
1289 lck_rw_type_t
1290 lck_rw_done(lck_rw_t *lock)
1291 {
1292 uint32_t data, prev;
1293
1294 for (;;) {
1295 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1296 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1297 atomic_exchange_abort();
1298 lck_rw_interlock_spin(lock);
1299 continue;
1300 }
1301 if (data & LCK_RW_SHARED_MASK) {
1302 data -= LCK_RW_SHARED_READER;
1303 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1304 goto check_waiters;
1305 }
1306 } else { /* if reader count == 0, must be exclusive lock */
1307 if (data & LCK_RW_WANT_UPGRADE) {
1308 data &= ~(LCK_RW_WANT_UPGRADE);
1309 } else {
1310 if (data & LCK_RW_WANT_WRITE) {
1311 data &= ~(LCK_RW_WANT_EXCL);
1312 } else { /* lock is not 'owned', panic */
1313 panic("Releasing non-exclusive RW lock without a reader refcount!");
1314 }
1315 }
1316 check_waiters:
1317 if (prev & LCK_RW_W_WAITING) {
1318 data &= ~(LCK_RW_W_WAITING);
1319 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1320 data &= ~(LCK_RW_R_WAITING);
1321 }
1322 } else {
1323 data &= ~(LCK_RW_R_WAITING);
1324 }
1325 }
1326 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1327 break;
1328 }
1329 cpu_pause();
1330 }
1331 return lck_rw_done_gen(lock, prev);
1332 }
1333
1334 /*
1335 * Routine: lck_rw_done_gen
1336 *
1337 * called from lck_rw_done()
1338 * prior_lock_state is the value in the 1st
1339 * word of the lock at the time of a successful
1340 * atomic compare and exchange with the new value...
1341 * it represents the state of the lock before we
1342 * decremented the rw_shared_count or cleared either
1343 * rw_want_upgrade or rw_want_write and
1344 * the lck_x_waiting bits... since the wrapper
1345 * routine has already changed the state atomically,
1346 * we just need to decide if we should
1347 * wake up anyone and what value to return... we do
1348 * this by examining the state of the lock before
1349 * we changed it
1350 */
1351 static lck_rw_type_t
1352 lck_rw_done_gen(
1353 lck_rw_t *lck,
1354 uint32_t prior_lock_state)
1355 {
1356 lck_rw_t *fake_lck;
1357 lck_rw_type_t lock_type;
1358 thread_t thread;
1359 uint32_t rwlock_count;
1360
1361 thread = current_thread();
1362 rwlock_count = thread->rwlock_count--;
1363 fake_lck = (lck_rw_t *)&prior_lock_state;
1364
1365 if (lck->lck_rw_can_sleep) {
1366 /*
1367 * prior_lock state is a snapshot of the 1st word of the
1368 * lock in question... we'll fake up a pointer to it
1369 * and carefully not access anything beyond whats defined
1370 * in the first word of a lck_rw_t
1371 */
1372
1373 if (fake_lck->lck_rw_shared_count <= 1) {
1374 if (fake_lck->lck_w_waiting) {
1375 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1376 }
1377
1378 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1379 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1380 }
1381 }
1382 #if MACH_LDEBUG
1383 if (rwlock_count == 0) {
1384 panic("rw lock count underflow for thread %p", thread);
1385 }
1386 #endif
1387 /* Check if dropping the lock means that we need to unpromote */
1388
1389 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1390 /* sched_flags checked without lock, but will be rechecked while clearing */
1391 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1392 }
1393 }
1394 if (fake_lck->lck_rw_shared_count) {
1395 lock_type = LCK_RW_TYPE_SHARED;
1396 } else {
1397 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1398 }
1399
1400 #if CONFIG_DTRACE
1401 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1402 #endif
1403
1404 return lock_type;
1405 }
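
/*
 * Illustrative sketch (not part of this file's build): the "fake_lck"
 * snapshot technique used above. The 32-bit prior_lock_state returned by the
 * compare-and-exchange is reinterpreted as the first word of a lck_rw_t so
 * the bitfield names can be used to inspect the pre-release state; nothing
 * beyond that first word may be touched.
 */
#if 0 /* example only */
static boolean_t
rw_prior_state_had_writer_waiting_example(uint32_t prior_lock_state)
{
	lck_rw_t *fake_lck = (lck_rw_t *)&prior_lock_state;

	return fake_lck->lck_w_waiting ? TRUE : FALSE;
}
#endif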
1406
1407
1408 /*
1409 * Routine: lck_rw_unlock
1410 */
1411 void
1412 lck_rw_unlock(
1413 lck_rw_t *lck,
1414 lck_rw_type_t lck_rw_type)
1415 {
1416 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1417 lck_rw_unlock_shared(lck);
1418 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1419 lck_rw_unlock_exclusive(lck);
1420 } else {
1421 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1422 }
1423 }
1424
1425
1426 /*
1427 * Routine: lck_rw_unlock_shared
1428 */
1429 void
1430 lck_rw_unlock_shared(
1431 lck_rw_t *lck)
1432 {
1433 lck_rw_type_t ret;
1434
1435 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1436 ret = lck_rw_done(lck);
1437
1438 if (ret != LCK_RW_TYPE_SHARED) {
1439 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1440 }
1441 }
1442
1443
1444 /*
1445 * Routine: lck_rw_unlock_exclusive
1446 */
1447 void
1448 lck_rw_unlock_exclusive(
1449 lck_rw_t *lck)
1450 {
1451 lck_rw_type_t ret;
1452
1453 ret = lck_rw_done(lck);
1454
1455 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1456 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1457 }
1458 }
1459
1460
1461 /*
1462 * Routine: lck_rw_lock
1463 */
1464 void
1465 lck_rw_lock(
1466 lck_rw_t *lck,
1467 lck_rw_type_t lck_rw_type)
1468 {
1469 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1470 lck_rw_lock_shared(lck);
1471 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1472 lck_rw_lock_exclusive(lck);
1473 } else {
1474 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1475 }
1476 }
1477
1478 /*
1479 * Routine: lck_rw_lock_shared
1480 */
1481 void
1482 lck_rw_lock_shared(lck_rw_t *lock)
1483 {
1484 uint32_t data, prev;
1485
1486 current_thread()->rwlock_count++;
1487 for (;;) {
1488 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1489 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1490 atomic_exchange_abort();
1491 if (lock->lck_rw_can_sleep) {
1492 lck_rw_lock_shared_gen(lock);
1493 } else {
1494 cpu_pause();
1495 continue;
1496 }
1497 break;
1498 }
1499 data += LCK_RW_SHARED_READER;
1500 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1501 break;
1502 }
1503 cpu_pause();
1504 }
1505 #if CONFIG_DTRACE
1506 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1507 #endif /* CONFIG_DTRACE */
1508 return;
1509 }
1510
1511 /*
1512 * Routine: lck_rw_lock_shared_gen
1513 * Function:
1514 * assembly fast path code has determined that this lock
1515 * is held exclusively... this is where we spin/block
1516 * until we can acquire the lock in the shared mode
1517 */
1518 static void
1519 lck_rw_lock_shared_gen(
1520 lck_rw_t *lck)
1521 {
1522 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1523 uint64_t deadline = 0;
1524 int gotlock = 0;
1525 int slept = 0;
1526 wait_result_t res = 0;
1527 boolean_t istate = -1;
1528
1529 #if CONFIG_DTRACE
1530 uint64_t wait_interval = 0;
1531 int readers_at_sleep = 0;
1532 boolean_t dtrace_ls_initialized = FALSE;
1533 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1534 #endif
1535
1536 while (!lck_rw_grab_shared(lck)) {
1537 #if CONFIG_DTRACE
1538 if (dtrace_ls_initialized == FALSE) {
1539 dtrace_ls_initialized = TRUE;
1540 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1541 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1542 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1543 if (dtrace_ls_enabled) {
1544 /*
1545 * Either sleeping or spinning is happening,
1546 * start a timing of our delay interval now.
1547 */
1548 readers_at_sleep = lck->lck_rw_shared_count;
1549 wait_interval = mach_absolute_time();
1550 }
1551 }
1552 #endif
1553 if (istate == -1) {
1554 istate = ml_get_interrupts_enabled();
1555 }
1556
1557 deadline = lck_rw_deadline_for_spin(lck);
1558
1559 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1560 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1561
1562 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
1563 lck_rw_lock_pause(istate);
1564 }
1565
1566 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1567 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1568
1569 if (gotlock) {
1570 break;
1571 }
1572 /*
1573 * if we get here, the deadline has expired w/o us
1574 * being able to grab the lock for read
1575 * check to see if we're allowed to do a thread_block
1576 */
1577 if (lck->lck_rw_can_sleep) {
1578 istate = lck_interlock_lock(lck);
1579
1580 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1581 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1582 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1583 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1584
1585 lck->lck_r_waiting = TRUE;
1586
1587 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1588 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1589 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1590 lck_interlock_unlock(lck, istate);
1591
1592 if (res == THREAD_WAITING) {
1593 res = thread_block(THREAD_CONTINUE_NULL);
1594 slept++;
1595 }
1596 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1597 trace_lck, res, slept, 0, 0);
1598 } else {
1599 lck->lck_rw_shared_count++;
1600 lck_interlock_unlock(lck, istate);
1601 break;
1602 }
1603 }
1604 }
1605
1606 #if CONFIG_DTRACE
1607 if (dtrace_ls_enabled == TRUE) {
1608 if (slept == 0) {
1609 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1610 } else {
1611 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1612 mach_absolute_time() - wait_interval, 0,
1613 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1614 }
1615 }
1616 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1617 #endif
1618 }
1619
1620
1621 /*
1622 * Routine: lck_rw_lock_exclusive
1623 */
1624
1625 void
1626 lck_rw_lock_exclusive(lck_rw_t *lock)
1627 {
1628 current_thread()->rwlock_count++;
1629 if (atomic_test_and_set32(&lock->data,
1630 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1631 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1632 #if CONFIG_DTRACE
1633 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1634 #endif /* CONFIG_DTRACE */
1635 } else {
1636 lck_rw_lock_exclusive_gen(lock);
1637 }
1638 }
1639
1640
1641 /*
1642 * Routine: lck_rw_lock_shared_to_exclusive
1643 */
1644
1645 boolean_t
1646 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1647 {
1648 uint32_t data, prev;
1649
1650 for (;;) {
1651 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1652 if (data & LCK_RW_INTERLOCK) {
1653 atomic_exchange_abort();
1654 lck_rw_interlock_spin(lock);
1655 continue;
1656 }
1657 if (data & LCK_RW_WANT_UPGRADE) {
1658 data -= LCK_RW_SHARED_READER;
1659 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1660 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1661 }
1662 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1663 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1664 }
1665 } else {
1666 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1667 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1668 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1669 break;
1670 }
1671 }
1672 cpu_pause();
1673 }
1674 /* we now own the WANT_UPGRADE */
1675 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1676 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1677 }
1678 #if CONFIG_DTRACE
1679 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1680 #endif
1681 return TRUE;
1682 }
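
/*
 * Illustrative sketch (not part of this file's build): the caller-side
 * contract of lck_rw_lock_shared_to_exclusive(). On FALSE the read hold has
 * already been dropped, so the caller must re-take the lock exclusively and
 * re-check any state it examined under the shared hold.
 */
#if 0 /* example only */
static void
rw_upgrade_usage_example(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	/* ... decide, while shared, that exclusive access is needed ... */
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* upgrade lost the race: we no longer hold the lock at all */
		lck_rw_lock_exclusive(lock);
		/* ... re-validate anything observed under the shared hold ... */
	}
	/* ... exclusive work ... */
	lck_rw_unlock_exclusive(lock);
}
#endif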
1683
1684
1685 /*
1686 * Routine: lck_rw_lock_shared_to_exclusive_failure
1687 * Function:
1688 * assembly fast path code has already dropped our read
1689 * count and determined that someone else owns 'lck_rw_want_upgrade'
1690 * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1691 * all we need to do here is determine if a wakeup is needed
1692 */
1693 static boolean_t
1694 lck_rw_lock_shared_to_exclusive_failure(
1695 lck_rw_t *lck,
1696 uint32_t prior_lock_state)
1697 {
1698 lck_rw_t *fake_lck;
1699 thread_t thread = current_thread();
1700 uint32_t rwlock_count;
1701
1702 /* Check if dropping the lock means that we need to unpromote */
1703 rwlock_count = thread->rwlock_count--;
1704 #if MACH_LDEBUG
1705 if (rwlock_count == 0) {
1706 panic("rw lock count underflow for thread %p", thread);
1707 }
1708 #endif
1709 fake_lck = (lck_rw_t *)&prior_lock_state;
1710
1711 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1712 /*
1713 * Someone else has requested upgrade.
1714 * Since we've released the read lock, wake
1715 * him up if he's blocked waiting
1716 */
1717 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1718 }
1719
1720 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1721 /* sched_flags checked without lock, but will be rechecked while clearing */
1722 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1723 }
1724
1725 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1726 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1727
1728 return FALSE;
1729 }
1730
1731
1732 /*
1733 * Routine: lck_rw_lock_shared_to_exclusive_success
1734 * Function:
1735 * assembly fast path code has already dropped our read
1736 * count and successfully acquired 'lck_rw_want_upgrade'
1737 * we just need to wait for the rest of the readers to drain
1738 * and then we can return as the exclusive holder of this lock
1739 */
1740 static boolean_t
1741 lck_rw_lock_shared_to_exclusive_success(
1742 lck_rw_t *lck)
1743 {
1744 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1745 uint64_t deadline = 0;
1746 int slept = 0;
1747 int still_shared = 0;
1748 wait_result_t res;
1749 boolean_t istate = -1;
1750
1751 #if CONFIG_DTRACE
1752 uint64_t wait_interval = 0;
1753 int readers_at_sleep = 0;
1754 boolean_t dtrace_ls_initialized = FALSE;
1755 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1756 #endif
1757
1758 while (lck->lck_rw_shared_count != 0) {
1759 #if CONFIG_DTRACE
1760 if (dtrace_ls_initialized == FALSE) {
1761 dtrace_ls_initialized = TRUE;
1762 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1763 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1764 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1765 if (dtrace_ls_enabled) {
1766 /*
1767 * Either sleeping or spinning is happening,
1768 * start a timing of our delay interval now.
1769 */
1770 readers_at_sleep = lck->lck_rw_shared_count;
1771 wait_interval = mach_absolute_time();
1772 }
1773 }
1774 #endif
1775 if (istate == -1) {
1776 istate = ml_get_interrupts_enabled();
1777 }
1778
1779 deadline = lck_rw_deadline_for_spin(lck);
1780
1781 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1782 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1783
1784 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
1785 lck_rw_lock_pause(istate);
1786 }
1787
1788 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1789 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1790
1791 if (!still_shared) {
1792 break;
1793 }
1794 /*
1795 * if we get here, the deadline has expired w/o
1796 * the rw_shared_count having drained to 0
1797 * check to see if we're allowed to do a thread_block
1798 */
1799 if (lck->lck_rw_can_sleep) {
1800 istate = lck_interlock_lock(lck);
1801
1802 if (lck->lck_rw_shared_count != 0) {
1803 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1804 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1805
1806 lck->lck_w_waiting = TRUE;
1807
1808 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1809 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1810 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1811 lck_interlock_unlock(lck, istate);
1812
1813 if (res == THREAD_WAITING) {
1814 res = thread_block(THREAD_CONTINUE_NULL);
1815 slept++;
1816 }
1817 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1818 trace_lck, res, slept, 0, 0);
1819 } else {
1820 lck_interlock_unlock(lck, istate);
1821 break;
1822 }
1823 }
1824 }
1825 #if CONFIG_DTRACE
1826 /*
1827 * We know whether we took the sleep/spin path above by checking dtrace_ls_enabled.
1828 */
1829 if (dtrace_ls_enabled == TRUE) {
1830 if (slept == 0) {
1831 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1832 } else {
1833 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1834 mach_absolute_time() - wait_interval, 1,
1835 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1836 }
1837 }
1838 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1839 #endif
1840 return TRUE;
1841 }
1842
1843 /*
1844 * Routine: lck_rw_lock_exclusive_to_shared
1845 */
1846
1847 void
1848 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1849 {
1850 uint32_t data, prev;
1851
1852 for (;;) {
1853 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1854 if (data & LCK_RW_INTERLOCK) {
1855 atomic_exchange_abort();
1856 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1857 continue;
1858 }
1859 data += LCK_RW_SHARED_READER;
1860 if (data & LCK_RW_WANT_UPGRADE) {
1861 data &= ~(LCK_RW_WANT_UPGRADE);
1862 } else {
1863 data &= ~(LCK_RW_WANT_EXCL);
1864 }
1865 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1866 data &= ~(LCK_RW_W_WAITING);
1867 }
1868 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1869 break;
1870 }
1871 cpu_pause();
1872 }
1873 lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1874 }
1875
1876
1877 /*
1878 * Routine: lck_rw_lock_exclusive_to_shared_gen
1879 * Function:
1880 * assembly fast path has already dropped
1881 * our exclusive state and bumped lck_rw_shared_count
1882 * all we need to do here is determine if anyone
1883 * needs to be awakened.
1884 */
1885 static void
1886 lck_rw_lock_exclusive_to_shared_gen(
1887 lck_rw_t *lck,
1888 uint32_t prior_lock_state)
1889 {
1890 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1891 lck_rw_t *fake_lck;
1892
1893 fake_lck = (lck_rw_t *)&prior_lock_state;
1894
1895 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1896 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1897
1898 /*
1899 * don't wake up anyone waiting to take the lock exclusively
1900 * since we hold a read count... when the read count drops to 0,
1901 * the writers will be woken.
1902 *
1903 * wake up any waiting readers if we don't have any writers waiting,
1904 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1905 */
1906 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1907 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1908 }
1909
1910 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1911 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1912
1913 #if CONFIG_DTRACE
1914 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1915 #endif
1916 }
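
/*
 * Illustrative sketch (not part of this file's build): a typical
 * write-then-read downgrade using lck_rw_lock_exclusive_to_shared(). The
 * exclusive hold is converted to a shared hold without ever dropping the
 * lock, so other readers can proceed while this thread keeps a consistent
 * view of what it just published.
 */
#if 0 /* example only */
static void
rw_downgrade_usage_example(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	/* ... publish an update while exclusive ... */
	lck_rw_lock_exclusive_to_shared(lock);	/* now held shared; readers may enter */
	/* ... continue reading the just-published state ... */
	lck_rw_unlock_shared(lock);
}
#endif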
1917
1918
1919 /*
1920 * Routine: lck_rw_try_lock
1921 */
1922 boolean_t
1923 lck_rw_try_lock(
1924 lck_rw_t *lck,
1925 lck_rw_type_t lck_rw_type)
1926 {
1927 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1928 return lck_rw_try_lock_shared(lck);
1929 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1930 return lck_rw_try_lock_exclusive(lck);
1931 } else {
1932 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1933 }
1934 return FALSE;
1935 }
1936
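/*
 * Example (illustrative sketch, for exposition only): opportunistic use of
 * lck_rw_try_lock() with a blocking fallback.  The example_* names are
 * hypothetical placeholders.
 */
extern lck_rw_t *example_table_lock;            /* hypothetical lock */

static __unused void
example_try_then_block(void)
{
        if (!lck_rw_try_lock(example_table_lock, LCK_RW_TYPE_EXCLUSIVE)) {
                /* somebody else holds it; fall back to the blocking acquire */
                lck_rw_lock_exclusive(example_table_lock);
        }
        /* ... exclusive work ... */
        lck_rw_unlock_exclusive(example_table_lock);
}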
1937 /*
1938 * Routine: lck_rw_try_lock_shared
1939 */
1940
1941 boolean_t
1942 lck_rw_try_lock_shared(lck_rw_t *lock)
1943 {
1944 uint32_t data, prev;
1945
1946 for (;;) {
1947 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1948 if (data & LCK_RW_INTERLOCK) {
1949 atomic_exchange_abort();
1950 lck_rw_interlock_spin(lock);
1951 continue;
1952 }
1953 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1954 atomic_exchange_abort();
1955 return FALSE; /* lock is busy */
1956 }
1957 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1958 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1959 break;
1960 }
1961 cpu_pause();
1962 }
1963 current_thread()->rwlock_count++;
1964 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1965 #if CONFIG_DTRACE
1966 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1967 #endif /* CONFIG_DTRACE */
1968 return TRUE;
1969 }
1970
1971
1972 /*
1973 * Routine: lck_rw_try_lock_exclusive
1974 */
1975
1976 boolean_t
1977 lck_rw_try_lock_exclusive(lck_rw_t *lock)
1978 {
1979 uint32_t data, prev;
1980
1981 for (;;) {
1982 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1983 if (data & LCK_RW_INTERLOCK) {
1984 atomic_exchange_abort();
1985 lck_rw_interlock_spin(lock);
1986 continue;
1987 }
1988 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1989 atomic_exchange_abort();
1990 return FALSE; /* can't get it */
1991 }
1992 data |= LCK_RW_WANT_EXCL;
1993 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1994 break;
1995 }
1996 cpu_pause();
1997 }
1998
1999 current_thread()->rwlock_count++;
2000 #if CONFIG_DTRACE
2001 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2002 #endif /* CONFIG_DTRACE */
2003 return TRUE;
2004 }
2005
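/*
 * Sketch (for exposition only): the shape of the atomic_exchange_begin32 /
 * atomic_exchange_complete32 retry loops used by the try-lock paths above,
 * expressed directly as a compare-and-swap loop.  The EXAMPLE_* masks and the
 * single-word layout are simplified assumptions; the real layout is given by
 * the LCK_RW_* masks.
 */
#define EXAMPLE_INTERLOCK       0x1u    /* assumed "interlock held" bit */
#define EXAMPLE_WANT_EXCL       0x2u    /* assumed "writer/upgrader ahead" bit */
#define EXAMPLE_READER_INC      0x4u    /* assumed reader refcount increment */

static __unused boolean_t
example_try_lock_shared(_Atomic uint32_t *word)
{
        uint32_t prev = __c11_atomic_load(word, __ATOMIC_RELAXED);

        for (;;) {
                if (prev & EXAMPLE_INTERLOCK) {         /* interlock held: reload and retry */
                        cpu_pause();
                        prev = __c11_atomic_load(word, __ATOMIC_RELAXED);
                        continue;
                }
                if (prev & EXAMPLE_WANT_EXCL) {         /* a writer is ahead of us: give up */
                        return FALSE;
                }
                /* try to publish prev + one reader; on failure prev is reloaded for us */
                if (__c11_atomic_compare_exchange_weak(word, &prev,
                    prev + EXAMPLE_READER_INC, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
                        return TRUE;
                }
                cpu_pause();
        }
}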
2006
2007 void
2008 lck_rw_assert(
2009 lck_rw_t *lck,
2010 unsigned int type)
2011 {
2012 switch (type) {
2013 case LCK_RW_ASSERT_SHARED:
2014 if (lck->lck_rw_shared_count != 0) {
2015 return;
2016 }
2017 break;
2018 case LCK_RW_ASSERT_EXCLUSIVE:
2019 if ((lck->lck_rw_want_write ||
2020 lck->lck_rw_want_upgrade) &&
2021 lck->lck_rw_shared_count == 0) {
2022 return;
2023 }
2024 break;
2025 case LCK_RW_ASSERT_HELD:
2026 if (lck->lck_rw_want_write ||
2027 lck->lck_rw_want_upgrade ||
2028 lck->lck_rw_shared_count != 0) {
2029 return;
2030 }
2031 break;
2032 case LCK_RW_ASSERT_NOTHELD:
2033 if (!(lck->lck_rw_want_write ||
2034 lck->lck_rw_want_upgrade ||
2035 lck->lck_rw_shared_count != 0)) {
2036 return;
2037 }
2038 break;
2039 default:
2040 break;
2041 }
2042
2043 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
2044 }
2045
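/*
 * Example (illustrative sketch, for exposition only): lck_rw_assert() is
 * typically used to document and enforce a routine's locking precondition.
 * The example_* names are hypothetical placeholders.
 */
extern lck_rw_t *example_list_lock;             /* hypothetical lock */
extern int example_list_head;                   /* hypothetical protected state */

static __unused int
example_list_first(void)
{
        /* caller must already hold the lock in shared or exclusive mode */
        lck_rw_assert(example_list_lock, LCK_RW_ASSERT_HELD);
        return example_list_head;
}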
2046 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
2047 void
2048 lck_rw_clear_promotions_x86(thread_t thread)
2049 {
2050 #if MACH_LDEBUG
2051 /* It's fatal to leave a RW lock locked and return to userspace */
2052 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2053 #else
2054 /* Paper over the issue */
2055 thread->rwlock_count = 0;
2056 lck_rw_clear_promotion(thread, 0);
2057 #endif
2058 }
2059
2060 boolean_t
2061 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2062 {
2063 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2064
2065 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2066 lck_rw_unlock_shared(lck);
2067 mutex_pause(2);
2068 lck_rw_lock_shared(lck);
2069 return TRUE;
2070 }
2071
2072 return FALSE;
2073 }
2074
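/*
 * Example (illustrative sketch, for exposition only): long shared-lock scans
 * can periodically offer the lock to queued writers via
 * lck_rw_lock_yield_shared().  The example_* names are hypothetical placeholders.
 */
extern lck_rw_t *example_scan_lock;             /* hypothetical lock */
extern void example_scan_one(int i);            /* hypothetical per-item work */

static __unused void
example_long_scan(int nitems)
{
        lck_rw_lock_shared(example_scan_lock);
        for (int i = 0; i < nitems; i++) {
                example_scan_one(i);
                /*
                 * If a writer is waiting, drop and re-take the shared lock.
                 * On return the lock is held shared again, but the scan may
                 * need to cope with state that changed while it was dropped.
                 */
                (void)lck_rw_lock_yield_shared(example_scan_lock, FALSE);
        }
        lck_rw_unlock_shared(example_scan_lock);
}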
2075 /*
2076 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2077 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2078 */
2079 boolean_t
2080 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2081 {
2082 if (not_in_kdp) {
2083 panic("rw lock exclusive check done outside of kernel debugger");
2084 }
2085 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2086 }
2087
2088 /*
2089 * Slow path routines for lck_mtx locking and unlocking functions.
2090 *
2091 * These functions were previously implemented in x86 assembly,
2092 * and some optimizations are in place in this C code to obtain compiled code
2093 * as performant and compact as the assembly version.
2094 *
2095 * To avoid inlining these functions on the fast path, all functions directly called by
2096 * the fast paths are marked __attribute__((noinline)). They are also implemented
2097 * in such a way that the fast path can tail call into them, so the return address
2098 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2099 *
2100 * Slow path code is structured so that there are no calls to functions that will return
2101 * in the context of the caller function, i.e. all functions called are either tail call functions
2102 * or inline functions. The tail call functions take fewer than six arguments,
2103 * so that they can be passed in registers and do not need to be pushed on the stack.
2104 * This allows the compiler to avoid creating a stack frame for these functions.
2105 *
2106 * __improbable and __probable are used to compile the slow path code in such a way
2107 * that the fast path case sits on a sequence of instructions with as few jumps as possible,
2108 * keeping that case the most optimized even when it falls through to the slow path.
2109 */
2110
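/*
 * Sketch (for exposition only): the fast-path / slow-path structure described
 * above, reduced to a minimal example.  The example_* names are hypothetical;
 * only the noinline + tail-call shape is the point.
 */
__attribute__((noinline))
static __unused void
example_lock_slow(_Atomic uint32_t *word)
{
        /* out-of-line contended path; fewer than six args, so all passed in registers */
        while (__c11_atomic_exchange(word, 1, __ATOMIC_ACQUIRE) != 0) {
                cpu_pause();
        }
}

static inline void
example_lock_fast(_Atomic uint32_t *word)
{
        if (__probable(__c11_atomic_exchange(word, 1, __ATOMIC_ACQUIRE) == 0)) {
                return;                         /* uncontended: straight-line code */
        }
        return example_lock_slow(word);         /* tail call into the noinline slow path */
}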
2111 /*
2112 * Intel lock invariants:
2113 *
2114 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2115 * lck_mtx_pri: contains the max priority of all waiters during a contention period
2116 * not cleared on last unlock, but stomped over on next first contention
2117 * lck_mtx_promoted: set when the current lock owner has been promoted
2118 * cleared when lock owner unlocks, set on acquire or wait.
2119 *
2120 * The lock owner is promoted to the max priority of all its waiters only if it
2121 * was a lower priority when it acquired or was an owner when a waiter waited.
2122 * Max priority is capped at MAXPRI_PROMOTE.
2123 *
2124 * The last waiter will not be promoted as it is woken up, but the last
2125 * lock owner may not have been the last thread to have been woken up depending on the
2126 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2127 * flag set.
2128 *
2129 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2130 * priority from dropping priority in the future without having to take thread lock
2131 * on acquire.
2132 */
2133
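/*
 * Sketch (for exposition only): the waiter priority that feeds the invariants
 * above is the waiter's scheduled/base priority clamped into
 * [BASEPRI_DEFAULT, MAXPRI_PROMOTE], mirroring the computation in
 * lck_mtx_lock_wait_x86() below.
 */
static inline integer_t
example_waiter_promotion_pri(thread_t self)
{
        integer_t pri = self->sched_pri;

        pri = MAX(pri, self->base_pri);
        pri = MAX(pri, BASEPRI_DEFAULT);        /* never promote below the default band */
        return MIN(pri, MAXPRI_PROMOTE);        /* promotion is capped at MAXPRI_PROMOTE */
}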
2134 #ifdef MUTEX_ZONE
2135 extern zone_t lck_mtx_zone;
2136 #endif
2137
2138 /*
2139 * Routine: lck_mtx_alloc_init
2140 */
2141 lck_mtx_t *
2142 lck_mtx_alloc_init(
2143 lck_grp_t *grp,
2144 lck_attr_t *attr)
2145 {
2146 lck_mtx_t *lck;
2147 #ifdef MUTEX_ZONE
2148 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) {
2149 lck_mtx_init(lck, grp, attr);
2150 }
2151 #else
2152 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) {
2153 lck_mtx_init(lck, grp, attr);
2154 }
2155 #endif
2156 return lck;
2157 }
2158
2159 /*
2160 * Routine: lck_mtx_free
2161 */
2162 void
2163 lck_mtx_free(
2164 lck_mtx_t *lck,
2165 lck_grp_t *grp)
2166 {
2167 lck_mtx_destroy(lck, grp);
2168 #ifdef MUTEX_ZONE
2169 zfree(lck_mtx_zone, lck);
2170 #else
2171 kfree(lck, sizeof(lck_mtx_t));
2172 #endif
2173 }
2174
2175 /*
2176 * Routine: lck_mtx_ext_init
2177 */
2178 static void
2179 lck_mtx_ext_init(
2180 lck_mtx_ext_t *lck,
2181 lck_grp_t *grp,
2182 lck_attr_t *attr)
2183 {
2184 bzero((void *)lck, sizeof(lck_mtx_ext_t));
2185
2186 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2187 lck->lck_mtx_deb.type = MUTEX_TAG;
2188 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2189 }
2190
2191 lck->lck_mtx_grp = grp;
2192
2193 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2194 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2195 }
2196
2197 lck->lck_mtx.lck_mtx_is_ext = 1;
2198 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2199 }
2200
2201 /*
2202 * Routine: lck_mtx_init
2203 */
2204 void
2205 lck_mtx_init(
2206 lck_mtx_t *lck,
2207 lck_grp_t *grp,
2208 lck_attr_t *attr)
2209 {
2210 lck_mtx_ext_t *lck_ext;
2211 lck_attr_t *lck_attr;
2212
2213 if (attr != LCK_ATTR_NULL) {
2214 lck_attr = attr;
2215 } else {
2216 lck_attr = &LockDefaultLckAttr;
2217 }
2218
2219 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2220 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2221 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2222 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2223 lck->lck_mtx_ptr = lck_ext;
2224 }
2225 } else {
2226 lck->lck_mtx_owner = 0;
2227 lck->lck_mtx_state = 0;
2228 }
2229 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2230 lck_grp_reference(grp);
2231 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2232 }
2233
2234 /*
2235 * Routine: lck_mtx_init_ext
2236 */
2237 void
2238 lck_mtx_init_ext(
2239 lck_mtx_t *lck,
2240 lck_mtx_ext_t *lck_ext,
2241 lck_grp_t *grp,
2242 lck_attr_t *attr)
2243 {
2244 lck_attr_t *lck_attr;
2245
2246 if (attr != LCK_ATTR_NULL) {
2247 lck_attr = attr;
2248 } else {
2249 lck_attr = &LockDefaultLckAttr;
2250 }
2251
2252 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2253 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2254 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2255 lck->lck_mtx_ptr = lck_ext;
2256 } else {
2257 lck->lck_mtx_owner = 0;
2258 lck->lck_mtx_state = 0;
2259 }
2260 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2261
2262 lck_grp_reference(grp);
2263 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2264 }
2265
2266 static void
2267 lck_mtx_lock_mark_destroyed(
2268 lck_mtx_t *mutex,
2269 boolean_t indirect)
2270 {
2271 uint32_t state;
2272
2273 if (indirect) {
2274 /* convert to destroyed state */
2275 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2276 return;
2277 }
2278
2279 state = ordered_load_mtx_state(mutex);
2280 lck_mtx_interlock_lock(mutex, &state);
2281
2282 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2283
2284 enable_preemption();
2285 }
2286
2287 /*
2288 * Routine: lck_mtx_destroy
2289 */
2290 void
2291 lck_mtx_destroy(
2292 lck_mtx_t *lck,
2293 lck_grp_t *grp)
2294 {
2295 boolean_t indirect;
2296
2297 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2298 return;
2299 }
2300 #if MACH_LDEBUG
2301 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2302 #endif
2303 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2304
2305 lck_mtx_lock_mark_destroyed(lck, indirect);
2306
2307 if (indirect) {
2308 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2309 }
2310 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2311 lck_grp_deallocate(grp);
2312 return;
2313 }
2314
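/*
 * Example (illustrative sketch, for exposition only): the usual
 * alloc/init/lock/destroy lifecycle for the routines above, using the lck_*
 * KPI from <kern/locks.h>.  The group name is hypothetical.
 */
static __unused void
example_mutex_lifecycle(void)
{
        lck_grp_t *grp = lck_grp_alloc_init("example.grp", LCK_GRP_ATTR_NULL);
        lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

        lck_mtx_lock(mtx);
        /* ... protected work ... */
        lck_mtx_unlock(mtx);

        lck_mtx_free(mtx, grp);         /* destroys the mutex, then frees it */
        lck_grp_free(grp);
}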
2315
2316 #if DEVELOPMENT | DEBUG
2317 __attribute__((noinline))
2318 void
2319 lck_mtx_owner_check_panic(
2320 lck_mtx_t *lock)
2321 {
2322 thread_t owner = (thread_t)lock->lck_mtx_owner;
2323 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2324 }
2325 #endif
2326
2327 __attribute__((always_inline))
2328 static boolean_t
2329 get_indirect_mutex(
2330 lck_mtx_t **lock,
2331 uint32_t *state)
2332 {
2333 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2334 *state = ordered_load_mtx_state(*lock);
2335 return TRUE;
2336 }
2337
2338 /*
2339 * Routine: lck_mtx_unlock_slow
2340 *
2341 * Unlocks a mutex held by current thread.
2342 *
2343 * It will wake up waiters if necessary and
2344 * drop promotions.
2345 *
2346 * Interlock can be held.
2347 */
2348 __attribute__((noinline))
2349 void
2350 lck_mtx_unlock_slow(
2351 lck_mtx_t *lock)
2352 {
2353 thread_t thread;
2354 uint32_t state, prev;
2355 boolean_t indirect = FALSE;
2356
2357 state = ordered_load_mtx_state(lock);
2358
2359 /* Is this an indirect mutex? */
2360 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2361 indirect = get_indirect_mutex(&lock, &state);
2362 }
2363
2364 thread = current_thread();
2365
2366 #if DEVELOPMENT | DEBUG
2367 thread_t owner = (thread_t)lock->lck_mtx_owner;
2368 if (__improbable(owner != thread)) {
2369 return lck_mtx_owner_check_panic(lock);
2370 }
2371 #endif
2372
2373 /* check if it is held as a spinlock */
2374 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
2375 goto unlock;
2376 }
2377
2378 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2379
2380 unlock:
2381 /* preemption disabled, interlock held and mutex not held */
2382
2383 /* clear owner */
2384 ordered_store_mtx_owner(lock, 0);
2385 /* keep original state in prev for later evaluation */
2386 prev = state;
2387 /* release interlock, promotion and clear spin flag */
2388 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK));
2389 if ((state & LCK_MTX_WAITERS_MSK)) {
2390 state -= LCK_MTX_WAITER; /* decrement waiter count */
2391 }
2392 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
2393
2394 #if MACH_LDEBUG
2395 /* perform lock statistics after drop to prevent delay */
2396 if (thread) {
2397 thread->mutex_count--; /* lock statistic */
2398 }
2399 #endif /* MACH_LDEBUG */
2400
2401 /* check if there are waiters to wake up or priority to drop */
2402 if ((prev & (LCK_MTX_PROMOTED_MSK | LCK_MTX_WAITERS_MSK))) {
2403 return lck_mtx_unlock_wakeup_tail(lock, prev, indirect);
2404 }
2405
2406 /* re-enable preemption */
2407 lck_mtx_unlock_finish_inline(lock, FALSE);
2408
2409 return;
2410 }
2411
2412 #define LCK_MTX_LCK_WAIT_CODE 0x20
2413 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
2414 #define LCK_MTX_LCK_SPIN_CODE 0x22
2415 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2416 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
2417
2418 /*
2419 * Routine: lck_mtx_unlock_wakeup_tail
2420 *
2421 * Invoked on unlock when there is
2422 * contention, i.e. the assembly routine sees
2423 * that mutex->lck_mtx_waiters != 0 or
2424 * that mutex->lck_mtx_promoted != 0
2425 *
2426 * neither the mutex nor the interlock is held
2427 *
2428 * Note that this routine might not be called if there are pending
2429 * waiters which have previously been woken up, and they didn't
2430 * end up boosting the old owner.
2431 *
2432 * assembly routine previously did the following to mutex:
2433 * (after saving the state in prior_lock_state)
2434 * cleared lck_mtx_promoted
2435 * decremented lck_mtx_waiters if nonzero
2436 *
2437 * This function needs to be called as a tail call
2438 * to optimize the compiled code.
2439 */
2440 __attribute__((noinline))
2441 static void
2442 lck_mtx_unlock_wakeup_tail(
2443 lck_mtx_t *mutex,
2444 int prior_lock_state,
2445 boolean_t indirect)
2446 {
2447 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2448 lck_mtx_t fake_lck;
2449
2450 /*
2451 * prior_lock_state is a snapshot of the 2nd word of the
2452 * lock in question... we'll fake up a lock with the bits
2453 * copied into place and carefully not access anything
2454 * beyond what's defined in the second word of a lck_mtx_t
2455 */
2456 fake_lck.lck_mtx_state = prior_lock_state;
2457
2458 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2459 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
2460
2461 if (__probable(fake_lck.lck_mtx_waiters)) {
2462 kern_return_t did_wake;
2463
2464 if (fake_lck.lck_mtx_waiters > 1) {
2465 did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
2466 } else {
2467 did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex));
2468 }
2469 /*
2470 * The waiters count always precisely matches the number of threads on the waitqueue.
2471 * i.e. we should never see did_wake == KERN_NOT_WAITING.
2472 */
2473 assert(did_wake == KERN_SUCCESS);
2474 }
2475
2476 /* When lck_mtx_promoted was set, then I as the owner definitely have a promotion */
2477 if (__improbable(fake_lck.lck_mtx_promoted)) {
2478 thread_t thread = current_thread();
2479
2480 spl_t s = splsched();
2481 thread_lock(thread);
2482
2483 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
2484 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
2485 assert(thread->was_promoted_on_wakeup == 0);
2486 assert(thread->promotions > 0);
2487
2488 assert_promotions_invariant(thread);
2489
2490 if (--thread->promotions == 0) {
2491 sched_thread_unpromote(thread, trace_lck);
2492 }
2493
2494 assert_promotions_invariant(thread);
2495
2496 thread_unlock(thread);
2497 splx(s);
2498 }
2499
2500 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2501 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2502
2503 lck_mtx_unlock_finish_inline(mutex, indirect);
2504 }
2505
2506 /*
2507 * Routine: lck_mtx_lock_acquire_x86
2508 *
2509 * Invoked on acquiring the mutex when there is
2510 * contention (i.e. the assembly routine sees that
2511 * mutex->lck_mtx_waiters != 0 or
2512 * thread->was_promoted_on_wakeup != 0)...
2513 *
2514 * mutex is owned... interlock is held... preemption is disabled
2515 */
2516 __attribute__((always_inline))
2517 static void
2518 lck_mtx_lock_acquire_inline(
2519 lck_mtx_t *mutex)
2520 {
2521 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2522 integer_t priority;
2523 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* owner is already set to us; faster than current_thread() */
2524 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2525 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2526
2527 if (mutex->lck_mtx_waiters) {
2528 priority = mutex->lck_mtx_pri;
2529 } else {
2530 priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */
2531 }
2532 /* the priority must have been set correctly by wait */
2533 assert(priority <= MAXPRI_PROMOTE);
2534 assert(priority == 0 || priority >= BASEPRI_DEFAULT);
2535
2536 /* if the mutex wasn't owned, then the owner wasn't promoted */
2537 assert(mutex->lck_mtx_promoted == 0);
2538
2539
2540
2541 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
2542 spl_t s = splsched();
2543 thread_lock(thread);
2544
2545 if (thread->was_promoted_on_wakeup) {
2546 assert(thread->promotions > 0);
2547 }
2548
2549 /* Intel only promotes if priority goes up */
2550 if (thread->sched_pri < priority && thread->promotion_priority < priority) {
2551 /* Remember that I need to drop this promotion on unlock */
2552 mutex->lck_mtx_promoted = 1;
2553
2554 if (thread->promotions++ == 0) {
2555 /* This is the first promotion for the owner */
2556 sched_thread_promote_to_pri(thread, priority, trace_lck);
2557 } else {
2558 /*
2559 * Holder was previously promoted due to a different mutex,
2560 * raise to match this one.
2561 * Or, this thread was promoted on wakeup but someone else
2562 * later contended on mutex at higher priority before we got here
2563 */
2564 sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
2565 }
2566 }
2567
2568 if (thread->was_promoted_on_wakeup) {
2569 thread->was_promoted_on_wakeup = 0;
2570 if (--thread->promotions == 0) {
2571 sched_thread_unpromote(thread, trace_lck);
2572 }
2573 }
2574
2575 thread_unlock(thread);
2576 splx(s);
2577 }
2578 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2579 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2580 }
2581
2582 void
2583 lck_mtx_lock_acquire_x86(
2584 lck_mtx_t *mutex)
2585 {
2586 return lck_mtx_lock_acquire_inline(mutex);
2587 }
2588
2589 /*
2590 * Tail call helpers for lock functions that perform
2591 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2592 * the caller's compiled code.
2593 */
2594
2595 __attribute__((noinline))
2596 static void
2597 lck_mtx_lock_acquire_tail(
2598 lck_mtx_t *mutex,
2599 boolean_t indirect)
2600 {
2601 lck_mtx_lock_acquire_inline(mutex);
2602 lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect);
2603 }
2604
2605 __attribute__((noinline))
2606 static boolean_t
2607 lck_mtx_try_lock_acquire_tail(
2608 lck_mtx_t *mutex)
2609 {
2610 lck_mtx_lock_acquire_inline(mutex);
2611 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2612
2613 return TRUE;
2614 }
2615
2616 __attribute__((noinline))
2617 static void
2618 lck_mtx_convert_spin_acquire_tail(
2619 lck_mtx_t *mutex)
2620 {
2621 lck_mtx_lock_acquire_inline(mutex);
2622 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2623 }
2624
2625 boolean_t
2626 lck_mtx_ilk_unlock(
2627 lck_mtx_t *mutex)
2628 {
2629 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2630 return TRUE;
2631 }
2632
2633 static inline void
2634 lck_mtx_interlock_lock_set_and_clear_flags(
2635 lck_mtx_t *mutex,
2636 uint32_t xor_flags,
2637 uint32_t and_flags,
2638 uint32_t *new_state)
2639 {
2640 uint32_t state, prev;
2641 state = *new_state;
2642
2643 for (;;) {
2644 /* have to wait for interlock to clear */
2645 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2646 cpu_pause();
2647 state = ordered_load_mtx_state(mutex);
2648 }
2649 prev = state; /* prev contains snapshot for exchange */
2650 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
2651 state &= ~and_flags; /* clear flags */
2652
2653 disable_preemption();
2654 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
2655 break;
2656 }
2657 enable_preemption();
2658 cpu_pause();
2659 state = ordered_load_mtx_state(mutex);
2660 }
2661 *new_state = state;
2662 return;
2663 }
2664
2665 static inline void
2666 lck_mtx_interlock_lock_clear_flags(
2667 lck_mtx_t *mutex,
2668 uint32_t and_flags,
2669 uint32_t *new_state)
2670 {
2671 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2672 }
2673
2674 static inline void
2675 lck_mtx_interlock_lock(
2676 lck_mtx_t *mutex,
2677 uint32_t *new_state)
2678 {
2679 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2680 }
2681
2682 static inline int
2683 lck_mtx_interlock_try_lock_set_flags(
2684 lck_mtx_t *mutex,
2685 uint32_t or_flags,
2686 uint32_t *new_state)
2687 {
2688 uint32_t state, prev;
2689 state = *new_state;
2690
2691 /* have to wait for interlock to clear */
2692 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2693 return 0;
2694 }
2695 prev = state; /* prev contains snapshot for exchange */
2696 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
2697 disable_preemption();
2698 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
2699 *new_state = state;
2700 return 1;
2701 }
2702
2703 enable_preemption();
2704 return 0;
2705 }
2706
2707 static inline int
2708 lck_mtx_interlock_try_lock(
2709 lck_mtx_t *mutex,
2710 uint32_t *new_state)
2711 {
2712 return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state);
2713 }
2714
2715 static inline int
2716 lck_mtx_interlock_try_lock_disable_interrupts(
2717 lck_mtx_t *mutex,
2718 boolean_t *istate)
2719 {
2720 uint32_t state;
2721
2722 *istate = ml_set_interrupts_enabled(FALSE);
2723 state = ordered_load_mtx_state(mutex);
2724
2725 if (lck_mtx_interlock_try_lock(mutex, &state)) {
2726 return 1;
2727 } else {
2728 ml_set_interrupts_enabled(*istate);
2729 return 0;
2730 }
2731 }
2732
2733 static inline void
2734 lck_mtx_interlock_unlock_enable_interrupts(
2735 lck_mtx_t *mutex,
2736 boolean_t istate)
2737 {
2738 lck_mtx_ilk_unlock(mutex);
2739 ml_set_interrupts_enabled(istate);
2740 }
2741
2742 __attribute__((noinline))
2743 static void
2744 lck_mtx_lock_contended(
2745 lck_mtx_t *lock,
2746 boolean_t indirect,
2747 boolean_t *first_miss)
2748 {
2749 lck_mtx_spinwait_ret_type_t ret;
2750 uint32_t state;
2751 thread_t thread;
2752
2753 try_again:
2754
2755 if (indirect) {
2756 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2757 }
2758
2759 ret = lck_mtx_lock_spinwait_x86(lock);
2760 state = ordered_load_mtx_state(lock);
2761 switch (ret) {
2762 case LCK_MTX_SPINWAIT_NO_SPIN:
2763 /*
2764 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2765 * try to spin.
2766 */
2767 if (indirect) {
2768 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2769 }
2770
2771 /* just fall through to case LCK_MTX_SPINWAIT_SPUN */
2772 case LCK_MTX_SPINWAIT_SPUN:
2773 /*
2774 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2775 * interlock not held
2776 */
2777 lck_mtx_interlock_lock(lock, &state);
2778 assert(state & LCK_MTX_ILOCKED_MSK);
2779
2780 if (state & LCK_MTX_MLOCKED_MSK) {
2781 if (indirect) {
2782 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2783 }
2784 lck_mtx_lock_wait_x86(lock);
2785 /*
2786 * interlock is not held here.
2787 */
2788 goto try_again;
2789 } else {
2790 /* grab the mutex */
2791 state |= LCK_MTX_MLOCKED_MSK;
2792 ordered_store_mtx_state_release(lock, state);
2793 thread = current_thread();
2794 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2795 #if MACH_LDEBUG
2796 if (thread) {
2797 thread->mutex_count++;
2798 }
2799 #endif /* MACH_LDEBUG */
2800 }
2801
2802 break;
2803 case LCK_MTX_SPINWAIT_ACQUIRED:
2804 /*
2805 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2806 * interlock is held and preemption disabled
2807 * owner is set and mutex marked as locked
2808 * statistics updated too
2809 */
2810 break;
2811 default:
2812 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2813 }
2814
2815 /*
2816 * interlock is already acquired here
2817 */
2818
2819 /* mutex has been acquired */
2820 thread = (thread_t)lock->lck_mtx_owner;
2821 if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) {
2822 return lck_mtx_lock_acquire_tail(lock, indirect);
2823 }
2824
2825 /* release the interlock */
2826 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2827 }
2828
2829 /*
2830 * Helper noinline functions for calling
2831 * panic to optimize compiled code.
2832 */
2833
2834 __attribute__((noinline))
2835 static void
2836 lck_mtx_destroyed(
2837 lck_mtx_t *lock)
2838 {
2839 panic("trying to interlock destroyed mutex (%p)", lock);
2840 }
2841
2842 __attribute__((noinline))
2843 static boolean_t
2844 lck_mtx_try_destroyed(
2845 lck_mtx_t *lock)
2846 {
2847 panic("trying to interlock destroyed mutex (%p)", lock);
2848 return FALSE;
2849 }
2850
2851 __attribute__((always_inline))
2852 static boolean_t
2853 lck_mtx_lock_wait_interlock_to_clear(
2854 lck_mtx_t *lock,
2855 uint32_t* new_state)
2856 {
2857 uint32_t state;
2858
2859 for (;;) {
2860 cpu_pause();
2861 state = ordered_load_mtx_state(lock);
2862 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2863 *new_state = state;
2864 return TRUE;
2865 }
2866 if (state & LCK_MTX_MLOCKED_MSK) {
2867 /* if it is held as mutex, just fail */
2868 return FALSE;
2869 }
2870 }
2871 }
2872
2873 __attribute__((always_inline))
2874 static boolean_t
2875 lck_mtx_try_lock_wait_interlock_to_clear(
2876 lck_mtx_t *lock,
2877 uint32_t* new_state)
2878 {
2879 uint32_t state;
2880
2881 for (;;) {
2882 cpu_pause();
2883 state = ordered_load_mtx_state(lock);
2884 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2885 /* if it is held as mutex or spin, just fail */
2886 return FALSE;
2887 }
2888 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2889 *new_state = state;
2890 return TRUE;
2891 }
2892 }
2893 }
2894
2895 /*
2896 * Routine: lck_mtx_lock_slow
2897 *
2898 * Locks a mutex for current thread.
2899 * If the lock is contended this function might
2900 * sleep.
2901 *
2902 * Called with interlock not held.
2903 */
2904 __attribute__((noinline))
2905 void
2906 lck_mtx_lock_slow(
2907 lck_mtx_t *lock)
2908 {
2909 boolean_t indirect = FALSE;
2910 uint32_t state;
2911 int first_miss = 0;
2912
2913 state = ordered_load_mtx_state(lock);
2914
2915 /* is the interlock or mutex held */
2916 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2917 /*
2918 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2919 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2920 * set in state (state == lck_mtx_tag)
2921 */
2922
2923
2924 /* is the mutex already held and not indirect */
2925 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
2926 /* no, must have been the mutex */
2927 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2928 }
2929
2930 /* check to see if it is marked destroyed */
2931 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2932 return lck_mtx_destroyed(lock);
2933 }
2934
2935 /* Is this an indirect mutex? */
2936 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2937 indirect = get_indirect_mutex(&lock, &state);
2938
2939 first_miss = 0;
2940 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2941
2942 if (state & LCK_MTX_SPIN_MSK) {
2943 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2944 assert(state & LCK_MTX_ILOCKED_MSK);
2945 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2946 }
2947 }
2948
2949 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2950 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2951 }
2952 }
2953
2954 /* no - can't be INDIRECT, DESTROYED or locked */
2955 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2956 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2957 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2958 }
2959 }
2960
2961 /* lock and interlock acquired */
2962
2963 thread_t thread = current_thread();
2964 /* record owner of mutex */
2965 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2966
2967 #if MACH_LDEBUG
2968 if (thread) {
2969 thread->mutex_count++; /* lock statistic */
2970 }
2971 #endif
2972 /*
2973 * Check if there are waiters to
2974 * inherit their priority.
2975 */
2976 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2977 return lck_mtx_lock_acquire_tail(lock, indirect);
2978 }
2979
2980 /* release the interlock */
2981 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2982
2983 return;
2984 }
2985
2986 __attribute__((noinline))
2987 boolean_t
2988 lck_mtx_try_lock_slow(
2989 lck_mtx_t *lock)
2990 {
2991 boolean_t indirect = FALSE;
2992 uint32_t state;
2993 int first_miss = 0;
2994
2995 state = ordered_load_mtx_state(lock);
2996
2997 /* is the interlock or mutex held */
2998 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2999 /*
3000 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3001 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3002 * set in state (state == lck_mtx_tag)
3003 */
3004
3005 /* is the mutex already held and not indirect */
3006 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3007 return FALSE;
3008 }
3009
3010 /* check to see if it is marked destroyed */
3011 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3012 return lck_mtx_try_destroyed(lock);
3013 }
3014
3015 /* Is this an indirect mutex? */
3016 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3017 indirect = get_indirect_mutex(&lock, &state);
3018
3019 first_miss = 0;
3020 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3021 }
3022
3023 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3024 if (indirect) {
3025 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3026 }
3027 return FALSE;
3028 }
3029 }
3030
3031 /* no - can't be INDIRECT, DESTROYED or locked */
3032 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3033 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3034 if (indirect) {
3035 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3036 }
3037 return FALSE;
3038 }
3039 }
3040
3041 /* lock and interlock acquired */
3042
3043 thread_t thread = current_thread();
3044 /* record owner of mutex */
3045 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3046
3047 #if MACH_LDEBUG
3048 if (thread) {
3049 thread->mutex_count++; /* lock statistic */
3050 }
3051 #endif
3052 /*
3053 * Check if there are waiters to
3054 * inherit their priority.
3055 */
3056 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3057 return lck_mtx_try_lock_acquire_tail(lock);
3058 }
3059
3060 /* release the interlock */
3061 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
3062
3063 return TRUE;
3064 }
3065
3066 __attribute__((noinline))
3067 void
3068 lck_mtx_lock_spin_slow(
3069 lck_mtx_t *lock)
3070 {
3071 boolean_t indirect = FALSE;
3072 uint32_t state;
3073 int first_miss = 0;
3074
3075 state = ordered_load_mtx_state(lock);
3076
3077 /* is the interlock or mutex held */
3078 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3079 /*
3080 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3081 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3082 * set in state (state == lck_mtx_tag)
3083 */
3084
3085
3086 /* is the mutex already held and not indirect */
3087 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3088 /* no, must have been the mutex */
3089 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3090 }
3091
3092 /* check to see if it is marked destroyed */
3093 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3094 return lck_mtx_destroyed(lock);
3095 }
3096
3097 /* Is this an indirect mutex? */
3098 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3099 indirect = get_indirect_mutex(&lock, &state);
3100
3101 first_miss = 0;
3102 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3103
3104 if (state & LCK_MTX_SPIN_MSK) {
3105 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
3106 assert(state & LCK_MTX_ILOCKED_MSK);
3107 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3108 }
3109 }
3110
3111 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3112 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3113 }
3114 }
3115
3116 /* no - can't be INDIRECT, DESTROYED or locked */
3117 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3118 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3119 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3120 }
3121 }
3122
3123 /* lock as spinlock and interlock acquired */
3124
3125 thread_t thread = current_thread();
3126 /* record owner of mutex */
3127 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3128
3129 #if MACH_LDEBUG
3130 if (thread) {
3131 thread->mutex_count++; /* lock statistic */
3132 }
3133 #endif
3134
3135 #if CONFIG_DTRACE
3136 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3137 #endif
3138 /* return with the interlock held and preemption disabled */
3139 return;
3140 }
3141
3142 __attribute__((noinline))
3143 boolean_t
3144 lck_mtx_try_lock_spin_slow(
3145 lck_mtx_t *lock)
3146 {
3147 boolean_t indirect = FALSE;
3148 uint32_t state;
3149 int first_miss = 0;
3150
3151 state = ordered_load_mtx_state(lock);
3152
3153 /* is the interlock or mutex held */
3154 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3155 /*
3156 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3157 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3158 * set in state (state == lck_mtx_tag)
3159 */
3160
3161 /* is the mutex already held and not indirect */
3162 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3163 return FALSE;
3164 }
3165
3166 /* check to see if it is marked destroyed */
3167 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3168 return lck_mtx_try_destroyed(lock);
3169 }
3170
3171 /* Is this an indirect mutex? */
3172 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3173 indirect = get_indirect_mutex(&lock, &state);
3174
3175 first_miss = 0;
3176 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3177 }
3178
3179 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3180 if (indirect) {
3181 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3182 }
3183 return FALSE;
3184 }
3185 }
3186
3187 /* no - can't be INDIRECT, DESTROYED or locked */
3188 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3189 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3190 if (indirect) {
3191 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3192 }
3193 return FALSE;
3194 }
3195 }
3196
3197 /* lock and interlock acquired */
3198
3199 thread_t thread = current_thread();
3200 /* record owner of mutex */
3201 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3202
3203 #if MACH_LDEBUG
3204 if (thread) {
3205 thread->mutex_count++; /* lock statistic */
3206 }
3207 #endif
3208
3209 #if CONFIG_DTRACE
3210 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3211 #endif
3212 return TRUE;
3213 }
3214
3215 __attribute__((noinline))
3216 void
3217 lck_mtx_convert_spin(
3218 lck_mtx_t *lock)
3219 {
3220 uint32_t state;
3221
3222 state = ordered_load_mtx_state(lock);
3223
3224 /* Is this an indirect mutex? */
3225 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3226 /* If so, take indirection */
3227 get_indirect_mutex(&lock, &state);
3228 }
3229
3230 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3231
3232 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3233 /* already owned as a mutex, just return */
3234 return;
3235 }
3236
3237 assert(get_preemption_level() > 0);
3238 assert(state & LCK_MTX_ILOCKED_MSK);
3239 assert(state & LCK_MTX_SPIN_MSK);
3240
3241 /*
3242 * Check if there are waiters to
3243 * inherit their priority.
3244 */
3245 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3246 return lck_mtx_convert_spin_acquire_tail(lock);
3247 }
3248
3249 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3250
3251 return;
3252 }
3253
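/*
 * Example (illustrative sketch, for exposition only): the usual caller
 * pattern for lck_mtx_convert_spin() above - take the lock in spin mode for a
 * quick check, convert to a full mutex only if blocking work is needed.  The
 * example_* names are hypothetical placeholders.
 */
extern lck_mtx_t *example_obj_lock;                     /* hypothetical mutex */
extern boolean_t example_obj_needs_work(void);          /* hypothetical quick check */
extern void example_obj_do_blocking_work(void);         /* hypothetical slow work */

static __unused void
example_check_then_convert(void)
{
        lck_mtx_lock_spin(example_obj_lock);            /* interlock held, preemption disabled */
        if (example_obj_needs_work()) {
                lck_mtx_convert_spin(example_obj_lock); /* now a full mutex; blocking is allowed */
                example_obj_do_blocking_work();
        }
        lck_mtx_unlock(example_obj_lock);               /* handles both spin and mutex mode */
}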
3254 static inline boolean_t
3255 lck_mtx_lock_grab_mutex(
3256 lck_mtx_t *lock)
3257 {
3258 uint32_t state;
3259
3260 state = ordered_load_mtx_state(lock);
3261
3262 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3263 return FALSE;
3264 }
3265
3266 /* lock and interlock acquired */
3267
3268 thread_t thread = current_thread();
3269 /* record owner of mutex */
3270 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3271
3272 #if MACH_LDEBUG
3273 if (thread) {
3274 thread->mutex_count++; /* lock statistic */
3275 }
3276 #endif
3277 return TRUE;
3278 }
3279
3280 __attribute__((noinline))
3281 void
3282 lck_mtx_assert(
3283 lck_mtx_t *lock,
3284 unsigned int type)
3285 {
3286 thread_t thread, owner;
3287 uint32_t state;
3288
3289 thread = current_thread();
3290 state = ordered_load_mtx_state(lock);
3291
3292 if (state == LCK_MTX_TAG_INDIRECT) {
3293 get_indirect_mutex(&lock, &state);
3294 }
3295
3296 owner = (thread_t)lock->lck_mtx_owner;
3297
3298 if (type == LCK_MTX_ASSERT_OWNED) {
3299 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
3300 panic("mutex (%p) not owned\n", lock);
3301 }
3302 } else {
3303 assert(type == LCK_MTX_ASSERT_NOTOWNED);
3304 if (owner == thread) {
3305 panic("mutex (%p) owned\n", lock);
3306 }
3307 }
3308 }
3309
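/*
 * Example (illustrative sketch, for exposition only): lck_mtx_assert()
 * documents a routine's locking precondition.  The example_* names are
 * hypothetical placeholders.
 */
extern lck_mtx_t *example_state_lock;           /* hypothetical mutex */
extern int example_state;                       /* hypothetical protected state */

static __unused void
example_state_bump_locked(void)
{
        /* caller must hold example_state_lock */
        lck_mtx_assert(example_state_lock, LCK_MTX_ASSERT_OWNED);
        example_state++;
}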
3310 /*
3311 * Routine: lck_mtx_lock_spinwait_x86
3312 *
3313 * Invoked trying to acquire a mutex when there is contention but
3314 * the holder is running on another processor. We spin for up to a maximum
3315 * time waiting for the lock to be released.
3316 *
3317 * Called with the interlock unlocked.
3318 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3319 * returns LCK_MTX_SPINWAIT_SPUN if we spun
3320 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
3321 */
3322 __attribute__((noinline))
3323 lck_mtx_spinwait_ret_type_t
3324 lck_mtx_lock_spinwait_x86(
3325 lck_mtx_t *mutex)
3326 {
3327 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3328 thread_t holder;
3329 uint64_t overall_deadline;
3330 uint64_t check_owner_deadline;
3331 uint64_t cur_time;
3332 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN;
3333 int loopcount = 0;
3334
3335 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3336 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3337
3338 cur_time = mach_absolute_time();
3339 overall_deadline = cur_time + MutexSpin;
3340 check_owner_deadline = cur_time;
3341
3342 /*
3343 * Spin while:
3344 * - mutex is locked, and
3345 * - it's locked as a spin lock, and
3346 * - owner is running on another processor, and
3347 * - owner (processor) is not idling, and
3348 * - we haven't spun for long enough.
3349 */
3350 do {
3351 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3352 retval = LCK_MTX_SPINWAIT_ACQUIRED;
3353 break;
3354 }
3355 cur_time = mach_absolute_time();
3356
3357 if (cur_time >= overall_deadline) {
3358 break;
3359 }
3360
3361 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
3362 boolean_t istate;
3363
3364 /*
3365 * We will repeatedly peek at the state of the lock while spinning,
3366 * and we will acquire the interlock to do so.
3367 * The thread that will unlock the mutex will also need to acquire
3368 * the interlock, and we want to avoid slowing it down.
3369 * To avoid taking an interrupt while holding the interlock,
3370 * which would increase the time we hold it, we
3371 * try to acquire the interlock with interrupts disabled.
3372 * This is safe because it is a "try_lock": if we can't acquire
3373 * the interlock we re-enable interrupts and fail, so it is
3374 * ok to call it even if the interlock was already held.
3375 */
3376 if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
3377 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
3378 if (!(holder->machine.specFlags & OnProc) ||
3379 (holder->state & TH_IDLE)) {
3380 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3381
3382 if (loopcount == 0) {
3383 retval = LCK_MTX_SPINWAIT_NO_SPIN;
3384 }
3385 break;
3386 }
3387 }
3388 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3389
3390 check_owner_deadline = cur_time + (MutexSpin / 4);
3391 }
3392 }
3393 cpu_pause();
3394
3395 loopcount++;
3396 } while (TRUE);
3397
3398 #if CONFIG_DTRACE
3399 /*
3400 * overall_deadline implicitly records when we started spinning
3401 * (it is the start time plus MutexSpin), so if dtrace is active
3402 * we compute backwards from it how long we spun.
3403 *
3404 * Note that we record a different probe id depending on whether
3405 * this is a direct or indirect mutex. This allows us to
3406 * penalize only lock groups that have debug/stats enabled
3407 * with dtrace processing if desired.
3408 */
3409 if (__probable(mutex->lck_mtx_is_ext == 0)) {
3410 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3411 mach_absolute_time() - (overall_deadline - MutexSpin));
3412 } else {
3413 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3414 mach_absolute_time() - (overall_deadline - MutexSpin));
3415 }
3416 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3417 #endif
3418
3419 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3420 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3421
3422 return retval;
3423 }
3424
3425
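/*
 * Sketch (for exposition only): the spin policy above reduced to its decision
 * structure.  example_try_grab() and example_owner_is_running() are
 * hypothetical stand-ins for lck_mtx_lock_grab_mutex() and the OnProc/TH_IDLE
 * owner checks; max_spin_abs plays the role of MutexSpin.
 */
extern boolean_t example_try_grab(lck_mtx_t *m);                /* hypothetical fast acquire */
extern boolean_t example_owner_is_running(lck_mtx_t *m);        /* hypothetical OnProc check */

static __unused boolean_t
example_adaptive_spin(lck_mtx_t *m, uint64_t max_spin_abs)
{
        uint64_t deadline = mach_absolute_time() + max_spin_abs;

        while (mach_absolute_time() < deadline) {
                if (example_try_grab(m)) {
                        return TRUE;            /* acquired while spinning */
                }
                if (!example_owner_is_running(m)) {
                        break;                  /* owner off core: spinning cannot help */
                }
                cpu_pause();
        }
        return FALSE;                           /* caller should block instead */
}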
3426
3427 /*
3428 * Routine: lck_mtx_lock_wait_x86
3429 *
3430 * Invoked in order to wait on contention.
3431 *
3432 * Called with the interlock locked and
3433 * preemption disabled...
3434 * returns it unlocked and with preemption enabled
3435 *
3436 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3437 * A runnable waiter can exist between wait and acquire
3438 * without a waiters count being set.
3439 * This allows us to never make a spurious wakeup call.
3440 *
3441 * Priority:
3442 * This avoids taking the thread lock if the owning thread is the same priority.
3443 * This optimizes the case of same-priority threads contending on a lock.
3444 * However, that allows the owning thread to drop in priority while holding the lock,
3445 * because there is no state that the priority change can notice that
3446 * says that the targeted thread holds a contended mutex.
3447 *
3448 * One possible solution: priority changes could look for some atomic tag
3449 * on the thread saying 'holding contended lock', and then set up a promotion.
3450 * Needs a story for dropping that promotion - the last contended unlock
3451 * has to notice that this has happened.
3452 */
3453 __attribute__((noinline))
3454 void
3455 lck_mtx_lock_wait_x86(
3456 lck_mtx_t *mutex)
3457 {
3458 #if CONFIG_DTRACE
3459 uint64_t sleep_start = 0;
3460
3461 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3462 sleep_start = mach_absolute_time();
3463 }
3464 #endif
3465 thread_t self = current_thread();
3466 assert(self->waiting_for_mutex == NULL);
3467
3468 self->waiting_for_mutex = mutex;
3469
3470 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3471
3472 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3473 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3474 mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
3475
3476 integer_t waiter_pri = self->sched_pri;
3477 waiter_pri = MAX(waiter_pri, self->base_pri);
3478 waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT);
3479 waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE);
3480
3481 assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE);
3482
3483 /* Set lck_mtx_pri on first contention, or when this waiter's priority is at least as high */
3484 if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri) {
3485 mutex->lck_mtx_pri = waiter_pri;
3486 }
3487
3488 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3489
3490 assert(holder != NULL);
3491
3492 /*
3493 * Intel only causes a promotion when priority needs to change,
3494 * reducing thread lock holds but leaving us vulnerable to the holder
3495 * dropping priority.
3496 */
3497 if (holder->sched_pri < mutex->lck_mtx_pri) {
3498 int promote_pri = mutex->lck_mtx_pri;
3499
3500 spl_t s = splsched();
3501 thread_lock(holder);
3502
3503 /* Check again in case sched_pri changed */
3504 if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) {
3505 if (mutex->lck_mtx_promoted == 0) {
3506 /* This is the first promotion for this mutex */
3507 mutex->lck_mtx_promoted = 1;
3508
3509 if (holder->promotions++ == 0) {
3510 /* This is the first promotion for holder */
3511 sched_thread_promote_to_pri(holder, promote_pri, trace_lck);
3512 } else {
3513 /*
3514 * Holder was previously promoted due to a different mutex,
3515 * check if it needs to raise to match this one
3516 */
3517 sched_thread_update_promotion_to_pri(holder, promote_pri,
3518 trace_lck);
3519 }
3520 } else {
3521 /*
3522 * Holder was previously promoted due to this mutex,
3523 * check if the pri needs to go up
3524 */
3525 sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck);
3526 }
3527 }
3528
3529 thread_unlock(holder);
3530 splx(s);
3531 }
3532
3533 mutex->lck_mtx_waiters++;
3534
3535 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3536 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
3537
3538 lck_mtx_ilk_unlock(mutex);
3539
3540 thread_block(THREAD_CONTINUE_NULL);
3541
3542 self->waiting_for_mutex = NULL;
3543
3544 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3545 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3546 mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
3547
3548 #if CONFIG_DTRACE
3549 /*
3550 * Record the Dtrace lockstat probe for blocking, block time
3551 * measured from when we were entered.
3552 */
3553 if (sleep_start) {
3554 if (mutex->lck_mtx_is_ext == 0) {
3555 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3556 mach_absolute_time() - sleep_start);
3557 } else {
3558 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3559 mach_absolute_time() - sleep_start);
3560 }
3561 }
3562 #endif
3563 }
3564
3565 /*
3566 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3567 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3568 * Returns: TRUE if lock is acquired.
3569 */
3570 boolean_t
3571 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3572 {
3573 if (not_in_kdp) {
3574 panic("kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3575 }
3576
3577 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3578 return TRUE;
3579 }
3580
3581 return FALSE;
3582 }
3583
3584 void
3585 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3586 {
3587 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3588 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3589 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3590 waitinfo->owner = thread_tid(holder);
3591 }
3592
3593 void
3594 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3595 {
3596 lck_rw_t *rwlck = NULL;
3597 switch (waitinfo->wait_type) {
3598 case kThreadWaitKernelRWLockRead:
3599 rwlck = READ_EVENT_TO_RWLOCK(event);
3600 break;
3601 case kThreadWaitKernelRWLockWrite:
3602 case kThreadWaitKernelRWLockUpgrade:
3603 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3604 break;
3605 default:
3606 panic("%s was called with an invalid blocking type", __FUNCTION__);
3607 break;
3608 }
3609 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3610 waitinfo->owner = 0;
3611 }