/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *
 *	Locking primitives implementation
 */
#include <mach_ldebug.h>

#include <kern/locks.h>
#include <kern/kalloc.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>

#include <i386/machine_routines.h>	/* machine_timeout_suspended() */
#include <machine/atomic.h>
#include <machine/machine_cpu.h>

#include <sys/kdebug.h>
#include <mach/branch_predicates.h>
/*
 * We need only enough declarations from the BSD-side to be able to
 * test if our probe is active, and to call __dtrace_probe().  Setting
 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
 */
#define NEED_DTRACE_DEFS
#include <../bsd/sys/lockstat.h>

#define DTRACE_RW_SHARED	0x0	/* reader */
#define DTRACE_RW_EXCL		0x1	/* writer */
#define DTRACE_NO_FLAG		0x0	/* not applicable */
#define	LCK_RW_LCK_EXCLUSIVE_CODE	0x100
#define	LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
#define	LCK_RW_LCK_SHARED_CODE		0x102
#define	LCK_RW_LCK_SH_TO_EX_CODE	0x103
#define	LCK_RW_LCK_SH_TO_EX1_CODE	0x104
#define	LCK_RW_LCK_EX_TO_SH_CODE	0x105

#define	LCK_RW_LCK_EX_WRITER_SPIN_CODE	0x106
#define	LCK_RW_LCK_EX_WRITER_WAIT_CODE	0x107
#define	LCK_RW_LCK_EX_READER_SPIN_CODE	0x108
#define	LCK_RW_LCK_EX_READER_WAIT_CODE	0x109
#define	LCK_RW_LCK_SHARED_SPIN_CODE	0x110
#define	LCK_RW_LCK_SHARED_WAIT_CODE	0x111
#define	LCK_RW_LCK_SH_TO_EX_SPIN_CODE	0x112
#define	LCK_RW_LCK_SH_TO_EX_WAIT_CODE	0x113

#define	ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
unsigned int LcksOpts = 0;

#if DEVELOPMENT || DEBUG
unsigned int LckDisablePreemptCheck = 0;
#endif

#if	USLOCK_DEBUG
/*
 *	Perform simple lock checks.
 */
int	uslock_check = 1;
int	max_lock_loops	= 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
#endif	/* USLOCK_DEBUG */

extern unsigned int not_in_kdp;
/*
 *	We often want to know the addresses of the callers
 *	of the various lock routines.  However, this information
 *	is only used for debugging and statistics.
 */
#define	INVALID_PC	((void *) VM_MAX_KERNEL_ADDRESS)
#define	INVALID_THREAD	((void *) VM_MAX_KERNEL_ADDRESS)

#if	ANY_LOCK_DEBUG
#define	OBTAIN_PC(pc)	((pc) = GET_RETURN_PC())
#define	DECL_PC(pc)	pc_t pc;
#else	/* ANY_LOCK_DEBUG */
#ifdef	lint
/*
 *	Eliminate lint complaints about unused local pc variables.
 */
#define	OBTAIN_PC(pc)	++pc
#else	/* lint */
#define	OBTAIN_PC(pc)
#endif	/* lint */
#endif	/* USLOCK_DEBUG */
// Enforce program order of loads and stores.
#define ordered_load(target) _Generic( (target),\
		uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \
		uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) )
#define ordered_store(target, value) _Generic( (target),\
		uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_relaxed), \
		uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_relaxed) )
/*
 * atomic exchange API is a low level abstraction of the operations
 * to atomically read, modify, and write a pointer.  This abstraction works
 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
 * well as the ARM exclusive instructions.
 *
 * atomic_exchange_begin() - begin exchange and retrieve current value
 * atomic_exchange_complete() - conclude an exchange
 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
 */
static uint32_t
atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
{
	uint32_t	val;

	(void)ord;			// Memory order not used
	val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
	*previous = val;
	return val;
}

static boolean_t
atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
{
	return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
}

static void
atomic_exchange_abort(void) { }

static boolean_t
atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
{
	uint32_t	value, prev;

	for ( ; ; ) {
		value = atomic_exchange_begin32(target, &prev, ord);
		if (value & test_mask) {
			if (wait)
				cpu_pause();			/* spin waiting for the bits to clear */
			else
				atomic_exchange_abort();	/* give up immediately */
			return FALSE;
		}
		value |= set_mask;
		if (atomic_exchange_complete32(target, prev, value, ord))
			return TRUE;
	}
}
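/*
 * Illustrative sketch only (not part of this lock package): the
 * begin/complete/abort triple is intended to be used as a compare-and-swap
 * retry loop, exactly as the lck_rw_grab_* routines below use it.  The
 * routine name example_set_flag32 is hypothetical.
 */
#if 0
static boolean_t
example_set_flag32(uint32_t *target, uint32_t flag)
{
	uint32_t	data, prev;

	for ( ; ; ) {
		data = atomic_exchange_begin32(target, &prev, memory_order_relaxed);
		if (data & flag) {
			atomic_exchange_abort();	/* flag already set, nothing to update */
			return FALSE;
		}
		data |= flag;
		if (atomic_exchange_complete32(target, prev, data, memory_order_relaxed))
			return TRUE;		/* no intervening writer, our update landed */
		/* *target changed underneath us... re-read and retry */
	}
}
#endif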
/*
 *	Portable lock package implementation of usimple_locks.
 */

#if	USLOCK_DEBUG
#define	USLDBG(stmt)	stmt
void		usld_lock_init(usimple_lock_t, unsigned short);
void		usld_lock_pre(usimple_lock_t, pc_t);
void		usld_lock_post(usimple_lock_t, pc_t);
void		usld_unlock(usimple_lock_t, pc_t);
void		usld_lock_try_pre(usimple_lock_t, pc_t);
void		usld_lock_try_post(usimple_lock_t, pc_t);
int		usld_lock_common_checks(usimple_lock_t, char *);
#else	/* USLOCK_DEBUG */
#define	USLDBG(stmt)
#endif	/* USLOCK_DEBUG */
/*
 * Forward definitions
 */

static void lck_rw_lock_shared_gen(lck_rw_t *lck);
static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
void lck_rw_clear_promotions_x86(thread_t thread);
static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
static boolean_t lck_rw_grab_want(lck_rw_t *lock);
static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
/*
 *      Routine:        lck_spin_alloc_init
 */
lck_spin_t *
lck_spin_alloc_init(
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	lck_spin_t	*lck;

	if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
		lck_spin_init(lck, grp, attr);

	return(lck);
}

/*
 *      Routine:        lck_spin_free
 */
void
lck_spin_free(
	lck_spin_t	*lck,
	lck_grp_t	*grp)
{
	lck_spin_destroy(lck, grp);
	kfree(lck, sizeof(lck_spin_t));
}

/*
 *      Routine:        lck_spin_init
 */
void
lck_spin_init(
	lck_spin_t	*lck,
	lck_grp_t	*grp,
	__unused lck_attr_t	*attr)
{
	usimple_lock_init((usimple_lock_t) lck, 0);
	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
}

/*
 *      Routine:        lck_spin_destroy
 */
void
lck_spin_destroy(
	lck_spin_t	*lck,
	lck_grp_t	*grp)
{
	if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
		return;
	lck->interlock = LCK_SPIN_TAG_DESTROYED;
	lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
	lck_grp_deallocate(grp);
}

/*
 *      Routine:        lck_spin_lock
 */
void
lck_spin_lock(
	lck_spin_t	*lck)
{
	usimple_lock((usimple_lock_t) lck);
}

/*
 *      Routine:        lck_spin_unlock
 */
void
lck_spin_unlock(
	lck_spin_t	*lck)
{
	usimple_unlock((usimple_lock_t) lck);
}

/*
 *      Routine:        lck_spin_try_lock
 */
boolean_t
lck_spin_try_lock(
	lck_spin_t	*lck)
{
	boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
#if	DEVELOPMENT || DEBUG
	if (lrval) {
		pltrace(FALSE);
	}
#endif
	return(lrval);
}
/*
 *      Routine:        lck_spin_assert
 */
void
lck_spin_assert(lck_spin_t *lock, unsigned int type)
{
	thread_t	thread, holder;
	uintptr_t	state;

	if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
		panic("lck_spin_assert(): invalid arg (%u)", type);
	}

	state = lock->interlock;
	holder = (thread_t)state;
	thread = current_thread();

	if (type == LCK_ASSERT_OWNED) {
		if (__improbable(holder == THREAD_NULL)) {
			panic("Lock not owned %p = %lx", lock, state);
		}
		if (__improbable(holder != thread)) {
			panic("Lock not owned by current thread %p = %lx", lock, state);
		}
	} else if (type == LCK_ASSERT_NOTOWNED) {
		if (__improbable(holder != THREAD_NULL)) {
			if (holder == thread) {
				panic("Lock owned by current thread %p = %lx", lock, state);
			} else {
				panic("Lock %p owned by thread %p", lock, holder);
			}
		}
	}
}

/*
 *      Routine: kdp_lck_spin_is_acquired
 *      NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 *      Returns: TRUE if lock is acquired.
 */
boolean_t
kdp_lck_spin_is_acquired(lck_spin_t *lck) {
	if (not_in_kdp) {
		panic("panic: spinlock acquired check done outside of kernel debugger");
	}
	return (lck->interlock != 0)? TRUE : FALSE;
}
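/*
 * Illustrative sketch only, assuming a caller-supplied lck_grp_t: the
 * typical life cycle of the lck_spin_* interfaces defined above.  The
 * names example_grp and example_data are hypothetical.
 */
#if 0
static void
example_spin_usage(lck_grp_t *example_grp)
{
	lck_spin_t	*lock;
	int		example_data = 0;

	lock = lck_spin_alloc_init(example_grp, LCK_ATTR_NULL);

	lck_spin_lock(lock);		/* returns with preemption disabled */
	example_data++;			/* short, non-blocking critical section */
	lck_spin_unlock(lock);		/* preemption re-enabled */

	lck_spin_free(lock, example_grp);
}
#endif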
397 * Initialize a usimple_lock.
399 * No change in preemption state.
404 __unused
unsigned short tag
)
406 #ifndef MACHINE_SIMPLE_LOCK
407 USLDBG(usld_lock_init(l
, tag
));
408 hw_lock_init(&l
->interlock
);
410 simple_lock_init((simple_lock_t
)l
,tag
);
414 volatile uint32_t spinlock_owner_cpu
= ~0;
415 volatile usimple_lock_t spinlock_timed_out
;
417 uint32_t spinlock_timeout_NMI(uintptr_t thread_addr
) {
420 for (i
= 0; i
< real_ncpus
; i
++) {
421 if ((cpu_data_ptr
[i
] != NULL
) && ((uintptr_t)cpu_data_ptr
[i
]->cpu_active_thread
== thread_addr
)) {
422 spinlock_owner_cpu
= i
;
423 if ((uint32_t) cpu_number() != i
) {
424 /* Cause NMI and panic on the owner's cpu */
425 NMIPI_panic(cpu_to_cpumask(i
), SPINLOCK_TIMEOUT
);
431 return spinlock_owner_cpu
;
435 * Acquire a usimple_lock.
437 * Returns with preemption disabled. Note
438 * that the hw_lock routines are responsible for
439 * maintaining preemption state.
445 #ifndef MACHINE_SIMPLE_LOCK
449 USLDBG(usld_lock_pre(l
, pc
));
451 if(__improbable(hw_lock_to(&l
->interlock
, LockTimeOutTSC
) == 0)) {
452 boolean_t uslock_acquired
= FALSE
;
453 while (machine_timeout_suspended()) {
455 if ((uslock_acquired
= hw_lock_to(&l
->interlock
, LockTimeOutTSC
)))
459 if (uslock_acquired
== FALSE
) {
461 uintptr_t lowner
= (uintptr_t)l
->interlock
.lock_data
;
462 spinlock_timed_out
= l
;
463 lock_cpu
= spinlock_timeout_NMI(lowner
);
464 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
465 l
, lowner
, current_thread(), lock_cpu
, (uintptr_t)l
->interlock
.lock_data
, mach_absolute_time());
468 #if DEVELOPMENT || DEBUG
472 USLDBG(usld_lock_post(l
, pc
));
474 simple_lock((simple_lock_t
)l
);
477 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE
, l
, 0);
483 * Release a usimple_lock.
485 * Returns with preemption enabled. Note
486 * that the hw_lock routines are responsible for
487 * maintaining preemption state.
493 #ifndef MACHINE_SIMPLE_LOCK
497 USLDBG(usld_unlock(l
, pc
));
498 #if DEVELOPMENT || DEBUG
501 hw_lock_unlock(&l
->interlock
);
503 simple_unlock_rwmb((simple_lock_t
)l
);
509 * Conditionally acquire a usimple_lock.
511 * On success, returns with preemption disabled.
512 * On failure, returns with preemption in the same state
513 * as when first invoked. Note that the hw_lock routines
514 * are responsible for maintaining preemption state.
516 * XXX No stats are gathered on a miss; I preserved this
517 * behavior from the original assembly-language code, but
518 * doesn't it make sense to log misses? XXX
524 #ifndef MACHINE_SIMPLE_LOCK
525 unsigned int success
;
529 USLDBG(usld_lock_try_pre(l
, pc
));
530 if ((success
= hw_lock_try(&l
->interlock
))) {
531 #if DEVELOPMENT || DEBUG
534 USLDBG(usld_lock_try_post(l
, pc
));
538 return(simple_lock_try((simple_lock_t
)l
));
543 * Acquire a usimple_lock while polling for pending TLB flushes
544 * and spinning on a lock.
548 usimple_lock_try_lock_loop(usimple_lock_t l
)
550 boolean_t istate
= ml_get_interrupts_enabled();
551 while (!simple_lock_try((l
))) {
553 handle_pending_TLB_flushes();
/*
 *	States of a usimple_lock.  The default when initializing
 *	a usimple_lock is setting it up for debug checking.
 */
#define	USLOCK_CHECKED		0x0001		/* lock is being checked */
#define	USLOCK_TAKEN		0x0002		/* lock has been taken */
#define	USLOCK_INIT		0xBAA0		/* lock has been initialized */
#define	USLOCK_INITIALIZED	(USLOCK_INIT|USLOCK_CHECKED)
#define	USLOCK_CHECKING(l)	(uslock_check &&	\
				 ((l)->debug.state & USLOCK_CHECKED))

/*
 *	Trace activities of a particularly interesting lock.
 */
void	usl_trace(usimple_lock_t, int, pc_t, const char *);
577 * Initialize the debugging information contained
583 __unused
unsigned short tag
)
585 if (l
== USIMPLE_LOCK_NULL
)
586 panic("lock initialization: null lock pointer");
587 l
->lock_type
= USLOCK_TAG
;
588 l
->debug
.state
= uslock_check
? USLOCK_INITIALIZED
: 0;
589 l
->debug
.lock_cpu
= l
->debug
.unlock_cpu
= 0;
590 l
->debug
.lock_pc
= l
->debug
.unlock_pc
= INVALID_PC
;
591 l
->debug
.lock_thread
= l
->debug
.unlock_thread
= INVALID_THREAD
;
592 l
->debug
.duration
[0] = l
->debug
.duration
[1] = 0;
593 l
->debug
.unlock_cpu
= l
->debug
.unlock_cpu
= 0;
594 l
->debug
.unlock_pc
= l
->debug
.unlock_pc
= INVALID_PC
;
595 l
->debug
.unlock_thread
= l
->debug
.unlock_thread
= INVALID_THREAD
;
600 * These checks apply to all usimple_locks, not just
601 * those with USLOCK_CHECKED turned on.
604 usld_lock_common_checks(
608 if (l
== USIMPLE_LOCK_NULL
)
609 panic("%s: null lock pointer", caller
);
610 if (l
->lock_type
!= USLOCK_TAG
)
611 panic("%s: %p is not a usimple lock, 0x%x", caller
, l
, l
->lock_type
);
612 if (!(l
->debug
.state
& USLOCK_INIT
))
613 panic("%s: %p is not an initialized lock, 0x%x", caller
, l
, l
->debug
.state
);
614 return USLOCK_CHECKING(l
);
619 * Debug checks on a usimple_lock just before attempting
628 char caller
[] = "usimple_lock";
631 if (!usld_lock_common_checks(l
, caller
))
	 * Note that we have a weird case where we are getting a lock when we are
	 * in the process of putting the system to sleep. We are running with no
	 * current threads, therefore we can't tell if we are trying to retake a lock
	 * we have or someone on the other processor has it. Therefore we just
	 * ignore this test if the locking thread is 0.
642 if ((l
->debug
.state
& USLOCK_TAKEN
) && l
->debug
.lock_thread
&&
643 l
->debug
.lock_thread
== (void *) current_thread()) {
644 printf("%s: lock %p already locked (at %p) by",
645 caller
, l
, l
->debug
.lock_pc
);
646 printf(" current thread %p (new attempt at pc %p)\n",
647 l
->debug
.lock_thread
, pc
);
650 mp_disable_preemption();
651 usl_trace(l
, cpu_number(), pc
, caller
);
652 mp_enable_preemption();
657 * Debug checks on a usimple_lock just after acquiring it.
659 * Pre-emption has been disabled at this point,
660 * so we are safe in using cpu_number.
668 char caller
[] = "successful usimple_lock";
671 if (!usld_lock_common_checks(l
, caller
))
674 if (!((l
->debug
.state
& ~USLOCK_TAKEN
) == USLOCK_INITIALIZED
))
675 panic("%s: lock %p became uninitialized",
677 if ((l
->debug
.state
& USLOCK_TAKEN
))
678 panic("%s: lock 0x%p became TAKEN by someone else",
681 mycpu
= cpu_number();
682 l
->debug
.lock_thread
= (void *)current_thread();
683 l
->debug
.state
|= USLOCK_TAKEN
;
684 l
->debug
.lock_pc
= pc
;
685 l
->debug
.lock_cpu
= mycpu
;
687 usl_trace(l
, mycpu
, pc
, caller
);
692 * Debug checks on a usimple_lock just before
693 * releasing it. Note that the caller has not
694 * yet released the hardware lock.
696 * Preemption is still disabled, so there's
697 * no problem using cpu_number.
705 char caller
[] = "usimple_unlock";
708 if (!usld_lock_common_checks(l
, caller
))
711 mycpu
= cpu_number();
713 if (!(l
->debug
.state
& USLOCK_TAKEN
))
714 panic("%s: lock 0x%p hasn't been taken",
716 if (l
->debug
.lock_thread
!= (void *) current_thread())
717 panic("%s: unlocking lock 0x%p, owned by thread %p",
718 caller
, l
, l
->debug
.lock_thread
);
719 if (l
->debug
.lock_cpu
!= mycpu
) {
720 printf("%s: unlocking lock 0x%p on cpu 0x%x",
722 printf(" (acquired on cpu 0x%x)\n", l
->debug
.lock_cpu
);
725 usl_trace(l
, mycpu
, pc
, caller
);
727 l
->debug
.unlock_thread
= l
->debug
.lock_thread
;
728 l
->debug
.lock_thread
= INVALID_PC
;
729 l
->debug
.state
&= ~USLOCK_TAKEN
;
730 l
->debug
.unlock_pc
= pc
;
731 l
->debug
.unlock_cpu
= mycpu
;
736 * Debug checks on a usimple_lock just before
737 * attempting to acquire it.
739 * Preemption isn't guaranteed to be disabled.
746 char caller
[] = "usimple_lock_try";
748 if (!usld_lock_common_checks(l
, caller
))
750 mp_disable_preemption();
751 usl_trace(l
, cpu_number(), pc
, caller
);
752 mp_enable_preemption();
757 * Debug checks on a usimple_lock just after
758 * successfully attempting to acquire it.
760 * Preemption has been disabled by the
761 * lock acquisition attempt, so it's safe
770 char caller
[] = "successful usimple_lock_try";
772 if (!usld_lock_common_checks(l
, caller
))
775 if (!((l
->debug
.state
& ~USLOCK_TAKEN
) == USLOCK_INITIALIZED
))
776 panic("%s: lock 0x%p became uninitialized",
778 if ((l
->debug
.state
& USLOCK_TAKEN
))
779 panic("%s: lock 0x%p became TAKEN by someone else",
782 mycpu
= cpu_number();
783 l
->debug
.lock_thread
= (void *) current_thread();
784 l
->debug
.state
|= USLOCK_TAKEN
;
785 l
->debug
.lock_pc
= pc
;
786 l
->debug
.lock_cpu
= mycpu
;
788 usl_trace(l
, mycpu
, pc
, caller
);
793 * For very special cases, set traced_lock to point to a
794 * specific lock of interest. The result is a series of
795 * XPRs showing lock operations on that lock. The lock_seq
796 * value is used to show the order of those operations.
798 usimple_lock_t traced_lock
;
799 unsigned int lock_seq
;
806 const char * op_name
)
808 if (traced_lock
== l
) {
810 "seq %d, cpu %d, %s @ %x\n",
811 (uintptr_t) lock_seq
, (uintptr_t) mycpu
,
812 (uintptr_t) op_name
, (uintptr_t) pc
, 0);
818 #endif /* USLOCK_DEBUG */
821 * Routine: lck_rw_alloc_init
829 if ((lck
= (lck_rw_t
*)kalloc(sizeof(lck_rw_t
))) != 0) {
830 bzero(lck
, sizeof(lck_rw_t
));
831 lck_rw_init(lck
, grp
, attr
);
838 * Routine: lck_rw_free
844 lck_rw_destroy(lck
, grp
);
845 kfree(lck
, sizeof(lck_rw_t
));
849 * Routine: lck_rw_init
857 lck_attr_t
*lck_attr
= (attr
!= LCK_ATTR_NULL
) ?
858 attr
: &LockDefaultLckAttr
;
860 hw_lock_byte_init(&lck
->lck_rw_interlock
);
861 lck
->lck_rw_want_write
= FALSE
;
862 lck
->lck_rw_want_upgrade
= FALSE
;
863 lck
->lck_rw_shared_count
= 0;
864 lck
->lck_rw_can_sleep
= TRUE
;
865 lck
->lck_r_waiting
= lck
->lck_w_waiting
= 0;
867 lck
->lck_rw_priv_excl
= ((lck_attr
->lck_attr_val
&
868 LCK_ATTR_RW_SHARED_PRIORITY
) == 0);
870 lck_grp_reference(grp
);
871 lck_grp_lckcnt_incr(grp
, LCK_TYPE_RW
);
875 * Routine: lck_rw_destroy
882 if (lck
->lck_rw_tag
== LCK_RW_TAG_DESTROYED
)
885 lck_rw_assert(lck
, LCK_RW_ASSERT_NOTHELD
);
887 lck
->lck_rw_tag
= LCK_RW_TAG_DESTROYED
;
888 lck_grp_lckcnt_decr(grp
, LCK_TYPE_RW
);
889 lck_grp_deallocate(grp
);
894 * Sleep locks. These use the same data structure and algorithm
895 * as the spin locks, but the process sleeps while it is waiting
896 * for the lock. These work on uniprocessor systems.
899 #define DECREMENTER_TIMEOUT 1000000
902 * We disable interrupts while holding the RW interlock to prevent an
903 * interrupt from exacerbating hold time.
904 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
906 static inline boolean_t
907 lck_interlock_lock(lck_rw_t
*lck
)
911 istate
= ml_set_interrupts_enabled(FALSE
);
912 hw_lock_byte_lock(&lck
->lck_rw_interlock
);
917 lck_interlock_unlock(lck_rw_t
*lck
, boolean_t istate
)
919 hw_lock_byte_unlock(&lck
->lck_rw_interlock
);
920 ml_set_interrupts_enabled(istate
);
924 * This inline is used when busy-waiting for an rw lock.
925 * If interrupts were disabled when the lock primitive was called,
926 * we poll the IPI handler for pending tlb flushes.
927 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
930 lck_rw_lock_pause(boolean_t interrupts_enabled
)
932 if (!interrupts_enabled
)
933 handle_pending_TLB_flushes();
937 static inline boolean_t
938 lck_rw_held_read_or_upgrade(lck_rw_t
*lock
)
940 if (ordered_load(&lock
->data
) & (LCK_RW_SHARED_MASK
| LCK_RW_INTERLOCK
| LCK_RW_WANT_UPGRADE
))
946 * compute the deadline to spin against when
947 * waiting for a change of state on a lck_rw_t
949 static inline uint64_t
950 lck_rw_deadline_for_spin(lck_rw_t
*lck
)
952 if (lck
->lck_rw_can_sleep
) {
953 if (lck
->lck_r_waiting
|| lck
->lck_w_waiting
|| lck
->lck_rw_shared_count
> machine_info
.max_cpus
) {
955 * there are already threads waiting on this lock... this
956 * implies that they have spun beyond their deadlines waiting for
957 * the desired state to show up so we will not bother spinning at this time...
959 * the current number of threads sharing this lock exceeds our capacity to run them
960 * concurrently and since all states we're going to spin for require the rw_shared_count
961 * to be at 0, we'll not bother spinning since the latency for this to happen is
964 return (mach_absolute_time());
966 return (mach_absolute_time() + MutexSpin
);
968 return (mach_absolute_time() + (100000LL * 1000000000LL));
973 * Spin while interlock is held.
977 lck_rw_interlock_spin(lck_rw_t
*lock
)
979 while (ordered_load(&lock
->data
) & LCK_RW_INTERLOCK
) {
985 lck_rw_grab_want(lck_rw_t
*lock
)
990 data
= atomic_exchange_begin32(&lock
->data
, &prev
, memory_order_relaxed
);
991 if ((data
& LCK_RW_INTERLOCK
) == 0)
993 atomic_exchange_abort();
994 lck_rw_interlock_spin(lock
);
996 if (data
& LCK_RW_WANT_WRITE
) {
997 atomic_exchange_abort();
1000 data
|= LCK_RW_WANT_WRITE
;
1001 return atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_relaxed
);
1005 lck_rw_grab_shared(lck_rw_t
*lock
)
1007 uint32_t data
, prev
;
1010 data
= atomic_exchange_begin32(&lock
->data
, &prev
, memory_order_acquire_smp
);
1011 if ((data
& LCK_RW_INTERLOCK
) == 0)
1013 atomic_exchange_abort();
1014 lck_rw_interlock_spin(lock
);
1016 if (data
& (LCK_RW_WANT_WRITE
| LCK_RW_WANT_UPGRADE
)) {
1017 if (((data
& LCK_RW_SHARED_MASK
) == 0) || (data
& LCK_RW_PRIV_EXCL
)) {
1018 atomic_exchange_abort();
1022 data
+= LCK_RW_SHARED_READER
;
1023 return atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_acquire_smp
);
1027 * Routine: lck_rw_lock_exclusive
1030 lck_rw_lock_exclusive_gen(
1033 __kdebug_only
uintptr_t trace_lck
= VM_KERNEL_UNSLIDE_OR_PERM(lck
);
1034 uint64_t deadline
= 0;
1038 wait_result_t res
= 0;
1039 boolean_t istate
= -1;
1042 boolean_t dtrace_ls_initialized
= FALSE
;
1043 boolean_t dtrace_rwl_excl_spin
, dtrace_rwl_excl_block
, dtrace_ls_enabled
= FALSE
;
1044 uint64_t wait_interval
= 0;
1045 int readers_at_sleep
= 0;
1049 * Try to acquire the lck_rw_want_write bit.
1051 while ( !lck_rw_grab_want(lck
)) {
1054 if (dtrace_ls_initialized
== FALSE
) {
1055 dtrace_ls_initialized
= TRUE
;
1056 dtrace_rwl_excl_spin
= (lockstat_probemap
[LS_LCK_RW_LOCK_EXCL_SPIN
] != 0);
1057 dtrace_rwl_excl_block
= (lockstat_probemap
[LS_LCK_RW_LOCK_EXCL_BLOCK
] != 0);
1058 dtrace_ls_enabled
= dtrace_rwl_excl_spin
|| dtrace_rwl_excl_block
;
1059 if (dtrace_ls_enabled
) {
1061 * Either sleeping or spinning is happening,
1062 * start a timing of our delay interval now.
1064 readers_at_sleep
= lck
->lck_rw_shared_count
;
1065 wait_interval
= mach_absolute_time();
1070 istate
= ml_get_interrupts_enabled();
1072 deadline
= lck_rw_deadline_for_spin(lck
);
1074 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_WRITER_SPIN_CODE
) | DBG_FUNC_START
, trace_lck
, 0, 0, 0, 0);
1076 while (((gotlock
= lck_rw_grab_want(lck
)) == 0) && mach_absolute_time() < deadline
)
1077 lck_rw_lock_pause(istate
);
1079 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_WRITER_SPIN_CODE
) | DBG_FUNC_END
, trace_lck
, 0, 0, gotlock
, 0);
1084 * if we get here, the deadline has expired w/o us
1085 * being able to grab the lock exclusively
1086 * check to see if we're allowed to do a thread_block
1088 if (lck
->lck_rw_can_sleep
) {
1090 istate
= lck_interlock_lock(lck
);
1092 if (lck
->lck_rw_want_write
) {
1094 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_WRITER_WAIT_CODE
) | DBG_FUNC_START
, trace_lck
, 0, 0, 0, 0);
1096 lck
->lck_w_waiting
= TRUE
;
1098 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite
);
1099 res
= assert_wait(RW_LOCK_WRITER_EVENT(lck
), THREAD_UNINT
);
1100 lck_interlock_unlock(lck
, istate
);
1102 if (res
== THREAD_WAITING
) {
1103 res
= thread_block(THREAD_CONTINUE_NULL
);
1106 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_WRITER_WAIT_CODE
) | DBG_FUNC_END
, trace_lck
, res
, slept
, 0, 0);
1108 lck
->lck_rw_want_write
= TRUE
;
1109 lck_interlock_unlock(lck
, istate
);
1115 * Wait for readers (and upgrades) to finish...
1116 * the test for these conditions must be done simultaneously with
1117 * a check of the interlock not being held since
1118 * the rw_shared_count will drop to 0 first and then want_upgrade
1119 * will be set to 1 in the shared_to_exclusive scenario... those
1120 * adjustments are done behind the interlock and represent an
1121 * atomic change in state and must be considered as such
1122 * however, once we see the read count at 0, the want_upgrade not set
1123 * and the interlock not held, we are safe to proceed
1125 while (lck_rw_held_read_or_upgrade(lck
)) {
1129 * Either sleeping or spinning is happening, start
1130 * a timing of our delay interval now. If we set it
1131 * to -1 we don't have accurate data so we cannot later
1132 * decide to record a dtrace spin or sleep event.
1134 if (dtrace_ls_initialized
== FALSE
) {
1135 dtrace_ls_initialized
= TRUE
;
1136 dtrace_rwl_excl_spin
= (lockstat_probemap
[LS_LCK_RW_LOCK_EXCL_SPIN
] != 0);
1137 dtrace_rwl_excl_block
= (lockstat_probemap
[LS_LCK_RW_LOCK_EXCL_BLOCK
] != 0);
1138 dtrace_ls_enabled
= dtrace_rwl_excl_spin
|| dtrace_rwl_excl_block
;
1139 if (dtrace_ls_enabled
) {
1141 * Either sleeping or spinning is happening,
1142 * start a timing of our delay interval now.
1144 readers_at_sleep
= lck
->lck_rw_shared_count
;
1145 wait_interval
= mach_absolute_time();
1150 istate
= ml_get_interrupts_enabled();
1152 deadline
= lck_rw_deadline_for_spin(lck
);
1154 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_READER_SPIN_CODE
) | DBG_FUNC_START
, trace_lck
, 0, 0, 0, 0);
1156 while ((lockheld
= lck_rw_held_read_or_upgrade(lck
)) && mach_absolute_time() < deadline
)
1157 lck_rw_lock_pause(istate
);
1159 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_READER_SPIN_CODE
) | DBG_FUNC_END
, trace_lck
, 0, 0, lockheld
, 0);
1164 * if we get here, the deadline has expired w/o us
1165 * being able to grab the lock exclusively
1166 * check to see if we're allowed to do a thread_block
1168 if (lck
->lck_rw_can_sleep
) {
1170 istate
= lck_interlock_lock(lck
);
1172 if (lck
->lck_rw_shared_count
!= 0 || lck
->lck_rw_want_upgrade
) {
1173 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_READER_WAIT_CODE
) | DBG_FUNC_START
, trace_lck
, 0, 0, 0, 0);
1175 lck
->lck_w_waiting
= TRUE
;
1177 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite
);
1178 res
= assert_wait(RW_LOCK_WRITER_EVENT(lck
), THREAD_UNINT
);
1179 lck_interlock_unlock(lck
, istate
);
1181 if (res
== THREAD_WAITING
) {
1182 res
= thread_block(THREAD_CONTINUE_NULL
);
1185 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_READER_WAIT_CODE
) | DBG_FUNC_END
, trace_lck
, res
, slept
, 0, 0);
1187 lck_interlock_unlock(lck
, istate
);
1189 * must own the lock now, since we checked for
1190 * readers or upgrade owner behind the interlock
1191 * no need for a call to 'lck_rw_held_read_or_upgrade'
1200 * Decide what latencies we suffered that are Dtrace events.
1201 * If we have set wait_interval, then we either spun or slept.
1202 * At least we get out from under the interlock before we record
1203 * which is the best we can do here to minimize the impact
1205 * If we have set wait_interval to -1, then dtrace was not enabled when we
1206 * started sleeping/spinning so we don't record this event.
1208 if (dtrace_ls_enabled
== TRUE
) {
1210 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN
, lck
,
1211 mach_absolute_time() - wait_interval
, 1);
1214 * For the blocking case, we also record if when we blocked
1215 * it was held for read or write, and how many readers.
1216 * Notice that above we recorded this before we dropped
1217 * the interlock so the count is accurate.
1219 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK
, lck
,
1220 mach_absolute_time() - wait_interval
, 1,
1221 (readers_at_sleep
== 0 ? 1 : 0), readers_at_sleep
);
1224 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE
, lck
, 1);
1229 * Routine: lck_rw_done
1232 lck_rw_type_t
lck_rw_done(lck_rw_t
*lock
)
1234 uint32_t data
, prev
;
1237 data
= atomic_exchange_begin32(&lock
->data
, &prev
, memory_order_release_smp
);
1238 if (data
& LCK_RW_INTERLOCK
) { /* wait for interlock to clear */
1239 atomic_exchange_abort();
1240 lck_rw_interlock_spin(lock
);
1243 if (data
& LCK_RW_SHARED_MASK
) {
1244 data
-= LCK_RW_SHARED_READER
;
1245 if ((data
& LCK_RW_SHARED_MASK
) == 0) /* if reader count has now gone to 0, check for waiters */
1247 } else { /* if reader count == 0, must be exclusive lock */
1248 if (data
& LCK_RW_WANT_UPGRADE
) {
1249 data
&= ~(LCK_RW_WANT_UPGRADE
);
1251 if (data
& LCK_RW_WANT_WRITE
)
1252 data
&= ~(LCK_RW_WANT_EXCL
);
1253 else /* lock is not 'owned', panic */
1254 panic("Releasing non-exclusive RW lock without a reader refcount!");
1257 if (prev
& LCK_RW_W_WAITING
) {
1258 data
&= ~(LCK_RW_W_WAITING
);
1259 if ((prev
& LCK_RW_PRIV_EXCL
) == 0)
1260 data
&= ~(LCK_RW_R_WAITING
);
1262 data
&= ~(LCK_RW_R_WAITING
);
1264 if (atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_release_smp
))
1268 return lck_rw_done_gen(lock
, prev
);
1272 * Routine: lck_rw_done_gen
1274 * called from lck_rw_done()
1275 * prior_lock_state is the value in the 1st
1276 * word of the lock at the time of a successful
1277 * atomic compare and exchange with the new value...
1278 * it represents the state of the lock before we
1279 * decremented the rw_shared_count or cleared either
1280 * rw_want_upgrade or rw_want_write and
1281 * the lck_x_waiting bits... since the wrapper
1282 * routine has already changed the state atomically,
1283 * we just need to decide if we should
1284 * wake up anyone and what value to return... we do
1285 * this by examining the state of the lock before
1288 static lck_rw_type_t
1291 uint32_t prior_lock_state
)
1294 lck_rw_type_t lock_type
;
1296 uint32_t rwlock_count
;
	 * prior_lock_state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond what's defined
	 * in the first word of a lck_rw_t
1304 fake_lck
= (lck_rw_t
*)&prior_lock_state
;
1306 if (fake_lck
->lck_rw_shared_count
<= 1) {
1307 if (fake_lck
->lck_w_waiting
)
1308 thread_wakeup(RW_LOCK_WRITER_EVENT(lck
));
1310 if (!(fake_lck
->lck_rw_priv_excl
&& fake_lck
->lck_w_waiting
) && fake_lck
->lck_r_waiting
)
1311 thread_wakeup(RW_LOCK_READER_EVENT(lck
));
1313 if (fake_lck
->lck_rw_shared_count
)
1314 lock_type
= LCK_RW_TYPE_SHARED
;
1316 lock_type
= LCK_RW_TYPE_EXCLUSIVE
;
1318 /* Check if dropping the lock means that we need to unpromote */
1319 thread
= current_thread();
1320 rwlock_count
= thread
->rwlock_count
--;
1322 if (rwlock_count
== 0) {
1323 panic("rw lock count underflow for thread %p", thread
);
1326 if ((rwlock_count
== 1 /* field now 0 */) && (thread
->sched_flags
& TH_SFLAG_RW_PROMOTED
)) {
1327 /* sched_flags checked without lock, but will be rechecked while clearing */
1328 lck_rw_clear_promotion(thread
);
1332 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE
, lck
, lock_type
== LCK_RW_TYPE_SHARED
? 0 : 1);
1340 * Routine: lck_rw_unlock
1345 lck_rw_type_t lck_rw_type
)
1347 if (lck_rw_type
== LCK_RW_TYPE_SHARED
)
1348 lck_rw_unlock_shared(lck
);
1349 else if (lck_rw_type
== LCK_RW_TYPE_EXCLUSIVE
)
1350 lck_rw_unlock_exclusive(lck
);
1352 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type
);
1357 * Routine: lck_rw_unlock_shared
1360 lck_rw_unlock_shared(
1365 assertf(lck
->lck_rw_shared_count
> 0, "lck %p has shared_count=0x%x", lck
, lck
->lck_rw_shared_count
);
1366 ret
= lck_rw_done(lck
);
1368 if (ret
!= LCK_RW_TYPE_SHARED
)
1369 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck
, ret
);
1374 * Routine: lck_rw_unlock_exclusive
1377 lck_rw_unlock_exclusive(
1382 ret
= lck_rw_done(lck
);
1384 if (ret
!= LCK_RW_TYPE_EXCLUSIVE
)
1385 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret
);
1390 * Routine: lck_rw_lock
1395 lck_rw_type_t lck_rw_type
)
1397 if (lck_rw_type
== LCK_RW_TYPE_SHARED
)
1398 lck_rw_lock_shared(lck
);
1399 else if (lck_rw_type
== LCK_RW_TYPE_EXCLUSIVE
)
1400 lck_rw_lock_exclusive(lck
);
1402 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type
);
1406 * Routine: lck_rw_lock_shared
1409 lck_rw_lock_shared(lck_rw_t
*lock
)
1411 uint32_t data
, prev
;
1413 current_thread()->rwlock_count
++;
1415 data
= atomic_exchange_begin32(&lock
->data
, &prev
, memory_order_acquire_smp
);
1416 if (data
& (LCK_RW_WANT_EXCL
| LCK_RW_WANT_UPGRADE
| LCK_RW_INTERLOCK
)) {
1417 atomic_exchange_abort();
1418 lck_rw_lock_shared_gen(lock
);
1421 data
+= LCK_RW_SHARED_READER
;
1422 if (atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_acquire_smp
))
1427 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE
, lock
, DTRACE_RW_SHARED
);
1428 #endif /* CONFIG_DTRACE */
1433 * Routine: lck_rw_lock_shared_gen
1435 * assembly fast path code has determined that this lock
1436 * is held exclusively... this is where we spin/block
1437 * until we can acquire the lock in the shared mode
1440 lck_rw_lock_shared_gen(
1443 __kdebug_only
uintptr_t trace_lck
= VM_KERNEL_UNSLIDE_OR_PERM(lck
);
1444 uint64_t deadline
= 0;
1447 wait_result_t res
= 0;
1448 boolean_t istate
= -1;
1451 uint64_t wait_interval
= 0;
1452 int readers_at_sleep
= 0;
1453 boolean_t dtrace_ls_initialized
= FALSE
;
1454 boolean_t dtrace_rwl_shared_spin
, dtrace_rwl_shared_block
, dtrace_ls_enabled
= FALSE
;
1457 while ( !lck_rw_grab_shared(lck
)) {
1460 if (dtrace_ls_initialized
== FALSE
) {
1461 dtrace_ls_initialized
= TRUE
;
1462 dtrace_rwl_shared_spin
= (lockstat_probemap
[LS_LCK_RW_LOCK_SHARED_SPIN
] != 0);
1463 dtrace_rwl_shared_block
= (lockstat_probemap
[LS_LCK_RW_LOCK_SHARED_BLOCK
] != 0);
1464 dtrace_ls_enabled
= dtrace_rwl_shared_spin
|| dtrace_rwl_shared_block
;
1465 if (dtrace_ls_enabled
) {
1467 * Either sleeping or spinning is happening,
1468 * start a timing of our delay interval now.
1470 readers_at_sleep
= lck
->lck_rw_shared_count
;
1471 wait_interval
= mach_absolute_time();
1476 istate
= ml_get_interrupts_enabled();
1478 deadline
= lck_rw_deadline_for_spin(lck
);
1480 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SHARED_SPIN_CODE
) | DBG_FUNC_START
,
1481 trace_lck
, lck
->lck_rw_want_write
, lck
->lck_rw_want_upgrade
, 0, 0);
1483 while (((gotlock
= lck_rw_grab_shared(lck
)) == 0) && mach_absolute_time() < deadline
)
1484 lck_rw_lock_pause(istate
);
1486 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SHARED_SPIN_CODE
) | DBG_FUNC_END
,
1487 trace_lck
, lck
->lck_rw_want_write
, lck
->lck_rw_want_upgrade
, gotlock
, 0);
1492 * if we get here, the deadline has expired w/o us
1493 * being able to grab the lock for read
1494 * check to see if we're allowed to do a thread_block
1496 if (lck
->lck_rw_can_sleep
) {
1498 istate
= lck_interlock_lock(lck
);
1500 if ((lck
->lck_rw_want_write
|| lck
->lck_rw_want_upgrade
) &&
1501 ((lck
->lck_rw_shared_count
== 0) || lck
->lck_rw_priv_excl
)) {
1503 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SHARED_WAIT_CODE
) | DBG_FUNC_START
,
1504 trace_lck
, lck
->lck_rw_want_write
, lck
->lck_rw_want_upgrade
, 0, 0);
1506 lck
->lck_r_waiting
= TRUE
;
1508 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead
);
1509 res
= assert_wait(RW_LOCK_READER_EVENT(lck
), THREAD_UNINT
);
1510 lck_interlock_unlock(lck
, istate
);
1512 if (res
== THREAD_WAITING
) {
1513 res
= thread_block(THREAD_CONTINUE_NULL
);
1516 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SHARED_WAIT_CODE
) | DBG_FUNC_END
,
1517 trace_lck
, res
, slept
, 0, 0);
1519 lck
->lck_rw_shared_count
++;
1520 lck_interlock_unlock(lck
, istate
);
1527 if (dtrace_ls_enabled
== TRUE
) {
1529 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN
, lck
, mach_absolute_time() - wait_interval
, 0);
1531 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK
, lck
,
1532 mach_absolute_time() - wait_interval
, 0,
1533 (readers_at_sleep
== 0 ? 1 : 0), readers_at_sleep
);
1536 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE
, lck
, 0);
1542 * Routine: lck_rw_lock_exclusive
1546 lck_rw_lock_exclusive(lck_rw_t
*lock
)
1548 current_thread()->rwlock_count
++;
1549 if (atomic_test_and_set32(&lock
->data
,
1550 (LCK_RW_SHARED_MASK
| LCK_RW_WANT_EXCL
| LCK_RW_WANT_UPGRADE
| LCK_RW_INTERLOCK
),
1551 LCK_RW_WANT_EXCL
, memory_order_acquire_smp
, FALSE
)) {
1553 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE
, lock
, DTRACE_RW_EXCL
);
1554 #endif /* CONFIG_DTRACE */
1556 lck_rw_lock_exclusive_gen(lock
);
1561 * Routine: lck_rw_lock_shared_to_exclusive
1565 lck_rw_lock_shared_to_exclusive(lck_rw_t
*lock
)
1567 uint32_t data
, prev
;
1570 data
= atomic_exchange_begin32(&lock
->data
, &prev
, memory_order_acquire_smp
);
1571 if (data
& LCK_RW_INTERLOCK
) {
1572 atomic_exchange_abort();
1573 lck_rw_interlock_spin(lock
);
1576 if (data
& LCK_RW_WANT_UPGRADE
) {
1577 data
-= LCK_RW_SHARED_READER
;
1578 if ((data
& LCK_RW_SHARED_MASK
) == 0) /* we were the last reader */
1579 data
&= ~(LCK_RW_W_WAITING
); /* so clear the wait indicator */
1580 if (atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_acquire_smp
))
1581 return lck_rw_lock_shared_to_exclusive_failure(lock
, prev
);
1583 data
|= LCK_RW_WANT_UPGRADE
; /* ask for WANT_UPGRADE */
1584 data
-= LCK_RW_SHARED_READER
; /* and shed our read count */
1585 if (atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_acquire_smp
))
1590 /* we now own the WANT_UPGRADE */
1591 if (data
& LCK_RW_SHARED_MASK
) /* check to see if all of the readers are drained */
1592 lck_rw_lock_shared_to_exclusive_success(lock
); /* if not, we need to go wait */
1594 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE
, lock
, 0);
1601 * Routine: lck_rw_lock_shared_to_exclusive_failure
1603 * assembly fast path code has already dropped our read
1604 * count and determined that someone else owns 'lck_rw_want_upgrade'
1605 * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1606 * all we need to do here is determine if a wakeup is needed
1609 lck_rw_lock_shared_to_exclusive_failure(
1611 uint32_t prior_lock_state
)
1614 thread_t thread
= current_thread();
1615 uint32_t rwlock_count
;
1617 /* Check if dropping the lock means that we need to unpromote */
1618 rwlock_count
= thread
->rwlock_count
--;
1620 if (rwlock_count
== 0) {
1621 panic("rw lock count underflow for thread %p", thread
);
1624 fake_lck
= (lck_rw_t
*)&prior_lock_state
;
1626 if (fake_lck
->lck_w_waiting
&& fake_lck
->lck_rw_shared_count
== 1) {
1628 * Someone else has requested upgrade.
1629 * Since we've released the read lock, wake
1630 * him up if he's blocked waiting
1632 thread_wakeup(RW_LOCK_WRITER_EVENT(lck
));
1635 if ((rwlock_count
== 1 /* field now 0 */) && (thread
->sched_flags
& TH_SFLAG_RW_PROMOTED
)) {
1636 /* sched_flags checked without lock, but will be rechecked while clearing */
1637 lck_rw_clear_promotion(thread
);
1640 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SH_TO_EX_CODE
) | DBG_FUNC_NONE
,
1641 VM_KERNEL_UNSLIDE_OR_PERM(lck
), lck
->lck_rw_shared_count
, lck
->lck_rw_want_upgrade
, 0, 0);
 *	Routine:	lck_rw_lock_shared_to_exclusive_success
 *		assembly fast path code has already dropped our read
 *		count and successfully acquired 'lck_rw_want_upgrade'
 *		we just need to wait for the rest of the readers to drain
 *		and then we can return as the exclusive holder of this lock
1656 lck_rw_lock_shared_to_exclusive_success(
1659 __kdebug_only
uintptr_t trace_lck
= VM_KERNEL_UNSLIDE_OR_PERM(lck
);
1660 uint64_t deadline
= 0;
1662 int still_shared
= 0;
1664 boolean_t istate
= -1;
1667 uint64_t wait_interval
= 0;
1668 int readers_at_sleep
= 0;
1669 boolean_t dtrace_ls_initialized
= FALSE
;
1670 boolean_t dtrace_rwl_shared_to_excl_spin
, dtrace_rwl_shared_to_excl_block
, dtrace_ls_enabled
= FALSE
;
1673 while (lck
->lck_rw_shared_count
!= 0) {
1676 if (dtrace_ls_initialized
== FALSE
) {
1677 dtrace_ls_initialized
= TRUE
;
1678 dtrace_rwl_shared_to_excl_spin
= (lockstat_probemap
[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN
] != 0);
1679 dtrace_rwl_shared_to_excl_block
= (lockstat_probemap
[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK
] != 0);
1680 dtrace_ls_enabled
= dtrace_rwl_shared_to_excl_spin
|| dtrace_rwl_shared_to_excl_block
;
1681 if (dtrace_ls_enabled
) {
1683 * Either sleeping or spinning is happening,
1684 * start a timing of our delay interval now.
1686 readers_at_sleep
= lck
->lck_rw_shared_count
;
1687 wait_interval
= mach_absolute_time();
1692 istate
= ml_get_interrupts_enabled();
1694 deadline
= lck_rw_deadline_for_spin(lck
);
1696 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SH_TO_EX_SPIN_CODE
) | DBG_FUNC_START
,
1697 trace_lck
, lck
->lck_rw_shared_count
, 0, 0, 0);
1699 while ((still_shared
= lck
->lck_rw_shared_count
) && mach_absolute_time() < deadline
)
1700 lck_rw_lock_pause(istate
);
1702 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SH_TO_EX_SPIN_CODE
) | DBG_FUNC_END
,
1703 trace_lck
, lck
->lck_rw_shared_count
, 0, 0, 0);
1708 * if we get here, the deadline has expired w/o
1709 * the rw_shared_count having drained to 0
1710 * check to see if we're allowed to do a thread_block
1712 if (lck
->lck_rw_can_sleep
) {
1714 istate
= lck_interlock_lock(lck
);
1716 if (lck
->lck_rw_shared_count
!= 0) {
1717 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SH_TO_EX_WAIT_CODE
) | DBG_FUNC_START
,
1718 trace_lck
, lck
->lck_rw_shared_count
, 0, 0, 0);
1720 lck
->lck_w_waiting
= TRUE
;
1722 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade
);
1723 res
= assert_wait(RW_LOCK_WRITER_EVENT(lck
), THREAD_UNINT
);
1724 lck_interlock_unlock(lck
, istate
);
1726 if (res
== THREAD_WAITING
) {
1727 res
= thread_block(THREAD_CONTINUE_NULL
);
1730 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_SH_TO_EX_WAIT_CODE
) | DBG_FUNC_END
,
1731 trace_lck
, res
, slept
, 0, 0);
1733 lck_interlock_unlock(lck
, istate
);
1740 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1742 if (dtrace_ls_enabled
== TRUE
) {
1744 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN
, lck
, mach_absolute_time() - wait_interval
, 0);
1746 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK
, lck
,
1747 mach_absolute_time() - wait_interval
, 1,
1748 (readers_at_sleep
== 0 ? 1 : 0), readers_at_sleep
);
1751 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE
, lck
, 1);
1757 * Routine: lck_rw_lock_exclusive_to_shared
1760 void lck_rw_lock_exclusive_to_shared(lck_rw_t
*lock
)
1762 uint32_t data
, prev
;
1765 data
= atomic_exchange_begin32(&lock
->data
, &prev
, memory_order_release_smp
);
1766 if (data
& LCK_RW_INTERLOCK
) {
1767 atomic_exchange_abort();
1768 lck_rw_interlock_spin(lock
); /* wait for interlock to clear */
1771 data
+= LCK_RW_SHARED_READER
;
1772 if (data
& LCK_RW_WANT_UPGRADE
)
1773 data
&= ~(LCK_RW_WANT_UPGRADE
);
1775 data
&= ~(LCK_RW_WANT_EXCL
);
1776 if (!((prev
& LCK_RW_W_WAITING
) && (prev
& LCK_RW_PRIV_EXCL
)))
1777 data
&= ~(LCK_RW_W_WAITING
);
1778 if (atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_release_smp
))
1782 return lck_rw_lock_exclusive_to_shared_gen(lock
, prev
);
1787 * Routine: lck_rw_lock_exclusive_to_shared_gen
1789 * assembly fast path has already dropped
1790 * our exclusive state and bumped lck_rw_shared_count
1791 * all we need to do here is determine if anyone
1792 * needs to be awakened.
1795 lck_rw_lock_exclusive_to_shared_gen(
1797 uint32_t prior_lock_state
)
1799 __kdebug_only
uintptr_t trace_lck
= VM_KERNEL_UNSLIDE_OR_PERM(lck
);
1802 fake_lck
= (lck_rw_t
*)&prior_lock_state
;
1804 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_TO_SH_CODE
) | DBG_FUNC_START
,
1805 trace_lck
, fake_lck
->lck_rw_want_write
, fake_lck
->lck_rw_want_upgrade
, 0, 0);
1808 * don't wake up anyone waiting to take the lock exclusively
1809 * since we hold a read count... when the read count drops to 0,
1810 * the writers will be woken.
1812 * wake up any waiting readers if we don't have any writers waiting,
1813 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1815 if (!(fake_lck
->lck_rw_priv_excl
&& fake_lck
->lck_w_waiting
) && fake_lck
->lck_r_waiting
)
1816 thread_wakeup(RW_LOCK_READER_EVENT(lck
));
1818 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_RW_LCK_EX_TO_SH_CODE
) | DBG_FUNC_END
,
1819 trace_lck
, lck
->lck_rw_want_write
, lck
->lck_rw_want_upgrade
, lck
->lck_rw_shared_count
, 0);
1822 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE
, lck
, 0);
1828 * Routine: lck_rw_try_lock
1833 lck_rw_type_t lck_rw_type
)
1835 if (lck_rw_type
== LCK_RW_TYPE_SHARED
)
1836 return(lck_rw_try_lock_shared(lck
));
1837 else if (lck_rw_type
== LCK_RW_TYPE_EXCLUSIVE
)
1838 return(lck_rw_try_lock_exclusive(lck
));
1840 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type
);
1845 * Routine: lck_rw_try_lock_shared
1848 boolean_t
lck_rw_try_lock_shared(lck_rw_t
*lock
)
1850 uint32_t data
, prev
;
1853 data
= atomic_exchange_begin32(&lock
->data
, &prev
, memory_order_acquire_smp
);
1854 if (data
& LCK_RW_INTERLOCK
) {
1855 atomic_exchange_abort();
1856 lck_rw_interlock_spin(lock
);
1859 if (data
& (LCK_RW_WANT_EXCL
| LCK_RW_WANT_UPGRADE
)) {
1860 atomic_exchange_abort();
1861 return FALSE
; /* lock is busy */
1863 data
+= LCK_RW_SHARED_READER
; /* Increment reader refcount */
1864 if (atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_acquire_smp
))
1868 current_thread()->rwlock_count
++;
1869 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1871 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE
, lock
, DTRACE_RW_SHARED
);
1872 #endif /* CONFIG_DTRACE */
1878 * Routine: lck_rw_try_lock_exclusive
1881 boolean_t
lck_rw_try_lock_exclusive(lck_rw_t
*lock
)
1883 uint32_t data
, prev
;
1886 data
= atomic_exchange_begin32(&lock
->data
, &prev
, memory_order_acquire_smp
);
1887 if (data
& LCK_RW_INTERLOCK
) {
1888 atomic_exchange_abort();
1889 lck_rw_interlock_spin(lock
);
1892 if (data
& (LCK_RW_SHARED_MASK
| LCK_RW_WANT_EXCL
| LCK_RW_WANT_UPGRADE
)) {
1893 atomic_exchange_abort();
1894 return FALSE
; /* can't get it */
1896 data
|= LCK_RW_WANT_EXCL
;
1897 if (atomic_exchange_complete32(&lock
->data
, prev
, data
, memory_order_acquire_smp
))
1902 current_thread()->rwlock_count
++;
1904 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE
, lock
, DTRACE_RW_EXCL
);
1905 #endif /* CONFIG_DTRACE */
1916 case LCK_RW_ASSERT_SHARED
:
1917 if (lck
->lck_rw_shared_count
!= 0) {
1921 case LCK_RW_ASSERT_EXCLUSIVE
:
1922 if ((lck
->lck_rw_want_write
||
1923 lck
->lck_rw_want_upgrade
) &&
1924 lck
->lck_rw_shared_count
== 0) {
1928 case LCK_RW_ASSERT_HELD
:
1929 if (lck
->lck_rw_want_write
||
1930 lck
->lck_rw_want_upgrade
||
1931 lck
->lck_rw_shared_count
!= 0) {
1935 case LCK_RW_ASSERT_NOTHELD
:
1936 if (!(lck
->lck_rw_want_write
||
1937 lck
->lck_rw_want_upgrade
||
1938 lck
->lck_rw_shared_count
!= 0)) {
1946 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck
, (type
== LCK_RW_ASSERT_NOTHELD
? "" : " not"), type
, *(uint32_t *)lck
);
1949 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1951 lck_rw_clear_promotions_x86(thread_t thread
)
1954 /* It's fatal to leave a RW lock locked and return to userspace */
1955 panic("%u rw lock(s) held on return to userspace for thread %p", thread
->rwlock_count
, thread
);
1957 /* Paper over the issue */
1958 thread
->rwlock_count
= 0;
1959 lck_rw_clear_promotion(thread
);
1964 lck_rw_lock_yield_shared(lck_rw_t
*lck
, boolean_t force_yield
)
1966 lck_rw_assert(lck
, LCK_RW_ASSERT_SHARED
);
1968 if (lck
->lck_rw_want_write
|| lck
->lck_rw_want_upgrade
|| force_yield
) {
1969 lck_rw_unlock_shared(lck
);
1971 lck_rw_lock_shared(lck
);
1979 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1980 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1983 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t
*lck
) {
1985 panic("panic: rw lock exclusive check done outside of kernel debugger");
1987 return ((lck
->lck_rw_want_upgrade
|| lck
->lck_rw_want_write
) && (lck
->lck_rw_shared_count
== 0)) ? TRUE
: FALSE
;
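/*
 * Illustrative sketch only, assuming a caller-supplied lck_grp_t: the
 * shared/exclusive/upgrade paths of the lck_rw_* interfaces implemented
 * above.  The names example_grp and example_rw_usage are hypothetical.
 */
#if 0
static void
example_rw_usage(lck_grp_t *example_grp)
{
	lck_rw_t	*lock;

	lock = lck_rw_alloc_init(example_grp, LCK_ATTR_NULL);

	lck_rw_lock_shared(lock);			/* many readers may hold this at once */
	if (lck_rw_lock_shared_to_exclusive(lock)) {
		/* upgrade succeeded... we now hold the lock exclusively */
		lck_rw_lock_exclusive_to_shared(lock);	/* downgrade back to shared */
	} else {
		/* upgrade failed... our shared hold was dropped, so reacquire it */
		lck_rw_lock_shared(lock);
	}
	(void) lck_rw_done(lock);			/* releases whichever mode is held */

	lck_rw_free(lock, example_grp);
}
#endif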
1992 extern zone_t lck_mtx_zone
;
1995 * Routine: lck_mtx_alloc_init
2004 if ((lck
= (lck_mtx_t
*)zalloc(lck_mtx_zone
)) != 0)
2005 lck_mtx_init(lck
, grp
, attr
);
2007 if ((lck
= (lck_mtx_t
*)kalloc(sizeof(lck_mtx_t
))) != 0)
2008 lck_mtx_init(lck
, grp
, attr
);
2014 * Routine: lck_mtx_free
2021 lck_mtx_destroy(lck
, grp
);
2023 zfree(lck_mtx_zone
, lck
);
2025 kfree(lck
, sizeof(lck_mtx_t
));
2030 * Routine: lck_mtx_ext_init
2038 bzero((void *)lck
, sizeof(lck_mtx_ext_t
));
2040 if ((attr
->lck_attr_val
) & LCK_ATTR_DEBUG
) {
2041 lck
->lck_mtx_deb
.type
= MUTEX_TAG
;
2042 lck
->lck_mtx_attr
|= LCK_MTX_ATTR_DEBUG
;
2045 lck
->lck_mtx_grp
= grp
;
2047 if (grp
->lck_grp_attr
& LCK_GRP_ATTR_STAT
)
2048 lck
->lck_mtx_attr
|= LCK_MTX_ATTR_STAT
;
2050 lck
->lck_mtx
.lck_mtx_is_ext
= 1;
2051 lck
->lck_mtx
.lck_mtx_pad32
= 0xFFFFFFFF;
2055 * Routine: lck_mtx_init
2063 lck_mtx_ext_t
*lck_ext
;
2064 lck_attr_t
*lck_attr
;
2066 if (attr
!= LCK_ATTR_NULL
)
2069 lck_attr
= &LockDefaultLckAttr
;
2071 if ((lck_attr
->lck_attr_val
) & LCK_ATTR_DEBUG
) {
2072 if ((lck_ext
= (lck_mtx_ext_t
*)kalloc(sizeof(lck_mtx_ext_t
))) != 0) {
2073 lck_mtx_ext_init(lck_ext
, grp
, lck_attr
);
2074 lck
->lck_mtx_tag
= LCK_MTX_TAG_INDIRECT
;
2075 lck
->lck_mtx_ptr
= lck_ext
;
2078 lck
->lck_mtx_owner
= 0;
2079 lck
->lck_mtx_state
= 0;
2081 lck
->lck_mtx_pad32
= 0xFFFFFFFF;
2082 lck_grp_reference(grp
);
2083 lck_grp_lckcnt_incr(grp
, LCK_TYPE_MTX
);
2087 * Routine: lck_mtx_init_ext
2092 lck_mtx_ext_t
*lck_ext
,
2096 lck_attr_t
*lck_attr
;
2098 if (attr
!= LCK_ATTR_NULL
)
2101 lck_attr
= &LockDefaultLckAttr
;
2103 if ((lck_attr
->lck_attr_val
) & LCK_ATTR_DEBUG
) {
2104 lck_mtx_ext_init(lck_ext
, grp
, lck_attr
);
2105 lck
->lck_mtx_tag
= LCK_MTX_TAG_INDIRECT
;
2106 lck
->lck_mtx_ptr
= lck_ext
;
2108 lck
->lck_mtx_owner
= 0;
2109 lck
->lck_mtx_state
= 0;
2111 lck
->lck_mtx_pad32
= 0xFFFFFFFF;
2113 lck_grp_reference(grp
);
2114 lck_grp_lckcnt_incr(grp
, LCK_TYPE_MTX
);
2118 * Routine: lck_mtx_destroy
2125 boolean_t lck_is_indirect
;
2127 if (lck
->lck_mtx_tag
== LCK_MTX_TAG_DESTROYED
)
2130 lck_mtx_assert(lck
, LCK_MTX_ASSERT_NOTOWNED
);
2132 lck_is_indirect
= (lck
->lck_mtx_tag
== LCK_MTX_TAG_INDIRECT
);
2134 lck_mtx_lock_mark_destroyed(lck
);
2136 if (lck_is_indirect
)
2137 kfree(lck
->lck_mtx_ptr
, sizeof(lck_mtx_ext_t
));
2138 lck_grp_lckcnt_decr(grp
, LCK_TYPE_MTX
);
2139 lck_grp_deallocate(grp
);
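/*
 * Illustrative sketch only, assuming a caller-supplied lck_grp_t: the
 * allocation/teardown pairing for the lck_mtx_* routines above.  The
 * names example_grp and example_mtx_usage are hypothetical; lck_mtx_lock
 * and lck_mtx_unlock themselves live in the fast-path code, not here.
 */
#if 0
static void
example_mtx_usage(lck_grp_t *example_grp)
{
	lck_mtx_t	*mutex;

	mutex = lck_mtx_alloc_init(example_grp, LCK_ATTR_NULL);

	lck_mtx_lock(mutex);		/* may block; holder can be priority-promoted by waiters */
	/* ... critical section that is allowed to sleep ... */
	lck_mtx_unlock(mutex);

	lck_mtx_free(mutex, example_grp);
}
#endif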
2144 #define LCK_MTX_LCK_WAIT_CODE 0x20
2145 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
2146 #define LCK_MTX_LCK_SPIN_CODE 0x22
2147 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2148 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
 *	Routine: 	lck_mtx_unlock_wakeup_x86
 *
 *	Invoked on unlock when there is
 *	contention (i.e. the assembly routine sees that
 *	mutex->lck_mtx_waiters != 0 or
 *	mutex->lck_mtx_promoted != 0)...
 *
 *	neither the mutex nor the interlock is held
2162 lck_mtx_unlock_wakeup_x86 (
2164 int prior_lock_state
)
2166 __kdebug_only
uintptr_t trace_lck
= VM_KERNEL_UNSLIDE_OR_PERM(mutex
);
	 * prior_lock_state is a snapshot of the 2nd word of the
	 * lock in question... we'll fake up a lock with the bits
	 * copied into place and carefully not access anything
	 * beyond what's defined in the second word of a lck_mtx_t
2175 fake_lck
.lck_mtx_state
= prior_lock_state
;
2177 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_WAKEUP_CODE
) | DBG_FUNC_START
,
2178 trace_lck
, fake_lck
.lck_mtx_promoted
, fake_lck
.lck_mtx_waiters
, fake_lck
.lck_mtx_pri
, 0);
2180 if (__probable(fake_lck
.lck_mtx_waiters
)) {
2181 if (fake_lck
.lck_mtx_waiters
> 1)
2182 thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex
), fake_lck
.lck_mtx_pri
);
2184 thread_wakeup_one(LCK_MTX_EVENT(mutex
));
2187 if (__improbable(fake_lck
.lck_mtx_promoted
)) {
2188 thread_t thread
= current_thread();
2191 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_DEMOTE_CODE
) | DBG_FUNC_NONE
,
2192 thread_tid(thread
), thread
->promotions
, thread
->sched_flags
& TH_SFLAG_PROMOTED
, 0, 0);
2194 if (thread
->promotions
> 0) {
2195 spl_t s
= splsched();
2197 thread_lock(thread
);
2199 if (--thread
->promotions
== 0 && (thread
->sched_flags
& TH_SFLAG_PROMOTED
)) {
2201 thread
->sched_flags
&= ~TH_SFLAG_PROMOTED
;
2203 if (thread
->sched_flags
& TH_SFLAG_RW_PROMOTED
) {
2204 /* Thread still has a RW lock promotion */
2205 } else if (thread
->sched_flags
& TH_SFLAG_DEPRESSED_MASK
) {
2206 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED
, MACH_DEMOTE
) | DBG_FUNC_NONE
,
2207 thread
->sched_pri
, DEPRESSPRI
, 0, trace_lck
, 0);
2209 set_sched_pri(thread
, DEPRESSPRI
);
2212 if (thread
->base_pri
< thread
->sched_pri
) {
2213 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED
, MACH_DEMOTE
) | DBG_FUNC_NONE
,
2214 thread
->sched_pri
, thread
->base_pri
, 0, trace_lck
, 0);
2216 thread_recompute_sched_pri(thread
, FALSE
);
2220 thread_unlock(thread
);
2224 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_WAKEUP_CODE
) | DBG_FUNC_END
,
2225 trace_lck
, 0, mutex
->lck_mtx_waiters
, 0, 0);
 *	Routine: 	lck_mtx_lock_acquire_x86
 *
 *	Invoked on acquiring the mutex when there is
 *	contention (i.e. the assembly routine sees that
 *	mutex->lck_mtx_waiters != 0 or
 *	thread->was_promoted_on_wakeup != 0)...
 *
 *	mutex is owned...  interlock is held...  preemption is disabled
2240 lck_mtx_lock_acquire_x86(
2243 __kdebug_only
uintptr_t trace_lck
= VM_KERNEL_UNSLIDE_OR_PERM(mutex
);
2248 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_ACQUIRE_CODE
) | DBG_FUNC_START
,
2249 trace_lck
, thread
->was_promoted_on_wakeup
, mutex
->lck_mtx_waiters
, mutex
->lck_mtx_pri
, 0);
2251 if (mutex
->lck_mtx_waiters
)
2252 priority
= mutex
->lck_mtx_pri
;
	thread = (thread_t) mutex->lck_mtx_owner;	/* faster than current_thread() */
2258 if (thread
->sched_pri
< priority
|| thread
->was_promoted_on_wakeup
) {
2260 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED
, MACH_PROMOTE
) | DBG_FUNC_NONE
,
2261 thread
->sched_pri
, priority
, thread
->was_promoted_on_wakeup
, trace_lck
, 0);
2264 thread_lock(thread
);
2266 if (thread
->sched_pri
< priority
) {
2267 /* Do not promote past promotion ceiling */
2268 assert(priority
<= MAXPRI_PROMOTE
);
2269 set_sched_pri(thread
, priority
);
2271 if (mutex
->lck_mtx_promoted
== 0) {
2272 mutex
->lck_mtx_promoted
= 1;
2274 thread
->promotions
++;
2275 thread
->sched_flags
|= TH_SFLAG_PROMOTED
;
2277 thread
->was_promoted_on_wakeup
= 0;
2279 thread_unlock(thread
);
2282 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_ACQUIRE_CODE
) | DBG_FUNC_END
,
2283 trace_lck
, 0, mutex
->lck_mtx_waiters
, 0, 0);
2288 lck_mtx_interlock_try_lock(lck_mtx_t
*mutex
, boolean_t
*istate
)
2292 *istate
= ml_set_interrupts_enabled(FALSE
);
2293 retval
= lck_mtx_ilk_try_lock(mutex
);
2296 ml_set_interrupts_enabled(*istate
);
2302 lck_mtx_interlock_unlock(lck_mtx_t
*mutex
, boolean_t istate
)
2304 lck_mtx_ilk_unlock(mutex
);
2305 ml_set_interrupts_enabled(istate
);
2310 * Routine: lck_mtx_lock_spinwait_x86
2312 * Invoked trying to acquire a mutex when there is contention but
2313 * the holder is running on another processor. We spin for up to a maximum
2314 * time waiting for the lock to be released.
2316 * Called with the interlock unlocked.
2317 * returns 0 if mutex acquired
2318 * returns 1 if we spun
2319 * returns 2 if we didn't spin due to the holder not running
2322 lck_mtx_lock_spinwait_x86(
2325 __kdebug_only
uintptr_t trace_lck
= VM_KERNEL_UNSLIDE_OR_PERM(mutex
);
2327 uint64_t overall_deadline
;
2328 uint64_t check_owner_deadline
;
2333 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_SPIN_CODE
) | DBG_FUNC_START
,
2334 trace_lck
, VM_KERNEL_UNSLIDE_OR_PERM(mutex
->lck_mtx_owner
), mutex
->lck_mtx_waiters
, 0, 0);
2336 cur_time
= mach_absolute_time();
2337 overall_deadline
= cur_time
+ MutexSpin
;
2338 check_owner_deadline
= cur_time
;
2342 * - mutex is locked, and
2343 * - its locked as a spin lock, and
2344 * - owner is running on another processor, and
2345 * - owner (processor) is not idling, and
2346 * - we haven't spun for long enough.
2349 if (__probable(lck_mtx_lock_grab_mutex(mutex
))) {
2353 cur_time
= mach_absolute_time();
2355 if (cur_time
>= overall_deadline
)
2358 if (cur_time
>= check_owner_deadline
&& mutex
->lck_mtx_owner
) {
2361 if (lck_mtx_interlock_try_lock(mutex
, &istate
)) {
2363 if ((holder
= (thread_t
) mutex
->lck_mtx_owner
) != NULL
) {
2365 if ( !(holder
->machine
.specFlags
& OnProc
) ||
2366 (holder
->state
& TH_IDLE
)) {
2368 lck_mtx_interlock_unlock(mutex
, istate
);
2375 lck_mtx_interlock_unlock(mutex
, istate
);
2377 check_owner_deadline
= cur_time
+ (MutexSpin
/ 4);
2388 * We've already kept a count via overall_deadline of how long we spun.
2389 * If dtrace is active, then we compute backwards to decide how
2392 * Note that we record a different probe id depending on whether
2393 * this is a direct or indirect mutex. This allows us to
2394 * penalize only lock groups that have debug/stats enabled
2395 * with dtrace processing if desired.
2397 if (__probable(mutex
->lck_mtx_is_ext
== 0)) {
2398 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN
, mutex
,
2399 mach_absolute_time() - (overall_deadline
- MutexSpin
));
2401 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN
, mutex
,
2402 mach_absolute_time() - (overall_deadline
- MutexSpin
));
2404 /* The lockstat acquire event is recorded by the assembly code beneath us. */
2407 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_SPIN_CODE
) | DBG_FUNC_END
,
2408 trace_lck
, VM_KERNEL_UNSLIDE_OR_PERM(mutex
->lck_mtx_owner
), mutex
->lck_mtx_waiters
, retval
, 0);
2416 * Routine: lck_mtx_lock_wait_x86
2418 * Invoked in order to wait on contention.
2420 * Called with the interlock locked and
2421 * preemption disabled...
2422 * returns it unlocked and with preemption enabled
2425 lck_mtx_lock_wait_x86 (
2428 __kdebug_only
uintptr_t trace_lck
= VM_KERNEL_UNSLIDE_OR_PERM(mutex
);
2429 thread_t self
= current_thread();
2434 uint64_t sleep_start
= 0;
2436 if (lockstat_probemap
[LS_LCK_MTX_LOCK_BLOCK
] || lockstat_probemap
[LS_LCK_MTX_EXT_LOCK_BLOCK
]) {
2437 sleep_start
= mach_absolute_time();
2440 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_WAIT_CODE
) | DBG_FUNC_START
,
2441 trace_lck
, VM_KERNEL_UNSLIDE_OR_PERM(mutex
->lck_mtx_owner
), mutex
->lck_mtx_waiters
, mutex
->lck_mtx_pri
, 0);
2443 priority
= self
->sched_pri
;
2445 if (priority
< self
->base_pri
)
2446 priority
= self
->base_pri
;
2447 if (priority
< BASEPRI_DEFAULT
)
2448 priority
= BASEPRI_DEFAULT
;
2450 /* Do not promote past promotion ceiling */
2451 priority
= MIN(priority
, MAXPRI_PROMOTE
);
2453 if (mutex
->lck_mtx_waiters
== 0 || priority
> mutex
->lck_mtx_pri
)
2454 mutex
->lck_mtx_pri
= priority
;
2455 mutex
->lck_mtx_waiters
++;
2457 if ( (holder
= (thread_t
)mutex
->lck_mtx_owner
) &&
2458 holder
->sched_pri
< mutex
->lck_mtx_pri
) {
2460 thread_lock(holder
);
2462 /* holder priority may have been bumped by another thread
2463 * before thread_lock was taken
2465 if (holder
->sched_pri
< mutex
->lck_mtx_pri
) {
2466 KERNEL_DEBUG_CONSTANT(
2467 MACHDBG_CODE(DBG_MACH_SCHED
, MACH_PROMOTE
) | DBG_FUNC_NONE
,
2468 holder
->sched_pri
, priority
, thread_tid(holder
), trace_lck
, 0);
2469 /* Assert that we're not altering the priority of a
2470 * thread above the MAXPRI_PROMOTE band
2472 assert(holder
->sched_pri
< MAXPRI_PROMOTE
);
2473 set_sched_pri(holder
, priority
);
2475 if (mutex
->lck_mtx_promoted
== 0) {
2476 holder
->promotions
++;
2477 holder
->sched_flags
|= TH_SFLAG_PROMOTED
;
2479 mutex
->lck_mtx_promoted
= 1;
2482 thread_unlock(holder
);
2485 thread_set_pending_block_hint(self
, kThreadWaitKernelMutex
);
2486 assert_wait(LCK_MTX_EVENT(mutex
), THREAD_UNINT
);
2488 lck_mtx_ilk_unlock(mutex
);
2490 thread_block(THREAD_CONTINUE_NULL
);
2492 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS
, LCK_MTX_LCK_WAIT_CODE
) | DBG_FUNC_END
,
2493 trace_lck
, VM_KERNEL_UNSLIDE_OR_PERM(mutex
->lck_mtx_owner
), mutex
->lck_mtx_waiters
, mutex
->lck_mtx_pri
, 0);
2497 * Record the Dtrace lockstat probe for blocking, block time
2498 * measured from when we were entered.
2501 if (mutex
->lck_mtx_is_ext
== 0) {
2502 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK
, mutex
,
2503 mach_absolute_time() - sleep_start
);
2505 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK
, mutex
,
2506 mach_absolute_time() - sleep_start
);
2513 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2514 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2515 * Returns: TRUE if lock is acquired.
2518 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t
*lck
)
2521 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
2524 if (lck
->lck_mtx_ilocked
|| lck
->lck_mtx_mlocked
) {
2532 kdp_lck_mtx_find_owner(__unused
struct waitq
* waitq
, event64_t event
, thread_waitinfo_t
* waitinfo
)
2534 lck_mtx_t
* mutex
= LCK_EVENT_TO_MUTEX(event
);
2535 waitinfo
->context
= VM_KERNEL_UNSLIDE_OR_PERM(mutex
);
2536 thread_t holder
= (thread_t
)mutex
->lck_mtx_owner
;
2537 waitinfo
->owner
= thread_tid(holder
);
2541 kdp_rwlck_find_owner(__unused
struct waitq
* waitq
, event64_t event
, thread_waitinfo_t
* waitinfo
)
2543 lck_rw_t
*rwlck
= NULL
;
2544 switch(waitinfo
->wait_type
) {
2545 case kThreadWaitKernelRWLockRead
:
2546 rwlck
= READ_EVENT_TO_RWLOCK(event
);
2548 case kThreadWaitKernelRWLockWrite
:
2549 case kThreadWaitKernelRWLockUpgrade
:
2550 rwlck
= WRITE_EVENT_TO_RWLOCK(event
);
2553 panic("%s was called with an invalid blocking type", __FUNCTION__
);
2556 waitinfo
->context
= VM_KERNEL_UNSLIDE_OR_PERM(rwlck
);
2557 waitinfo
->owner
= 0;