1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #include <mach_ldebug.h>
65
66 #include <kern/locks.h>
67 #include <kern/kalloc.h>
68 #include <kern/misc_protos.h>
69 #include <kern/thread.h>
70 #include <kern/processor.h>
71 #include <kern/cpu_data.h>
72 #include <kern/cpu_number.h>
73 #include <kern/sched_prim.h>
74 #include <kern/xpr.h>
75 #include <kern/debug.h>
76 #include <string.h>
77
78 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
79 #include <machine/machine_cpu.h>
80 #include <i386/mp.h>
81
82 #include <sys/kdebug.h>
83 #include <mach/branch_predicates.h>
84
85 /*
86 * We need only enough declarations from the BSD-side to be able to
87 * test if our probe is active, and to call __dtrace_probe(). Setting
88 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
89 */
90 #if CONFIG_DTRACE
91 #define NEED_DTRACE_DEFS
92 #include <../bsd/sys/lockstat.h>
93 #endif
94
95 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
96 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
97 #define LCK_RW_LCK_SHARED_CODE 0x102
98 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
99 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
100 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
101
102 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
103 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
104 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
105 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
106 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
107 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
108 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
109 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
110
111
112 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
113
114 unsigned int LcksOpts=0;
115
116 /* Forwards */
117
118 #if USLOCK_DEBUG
119 /*
120 * Perform simple lock checks.
121 */
122 int uslock_check = 1;
123 int max_lock_loops = 100000000;
124 decl_simple_lock_data(extern , printf_lock)
125 decl_simple_lock_data(extern , panic_lock)
126 #endif /* USLOCK_DEBUG */
127
128 extern unsigned int not_in_kdp;
129 extern void kdp_lck_mtx_find_owner(
130 struct waitq * waitq,
131 event64_t event,
132 thread_waitinfo_t * waitinfo);
133
134 extern void kdp_rwlck_find_owner(
135 struct waitq * waitq,
136 event64_t event,
137 thread_waitinfo_t * waitinfo);
138
139 /*
140 * We often want to know the addresses of the callers
141 * of the various lock routines. However, this information
142 * is only used for debugging and statistics.
143 */
144 typedef void *pc_t;
145 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
146 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
147 #if ANY_LOCK_DEBUG
148 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
149 #define DECL_PC(pc) pc_t pc;
150 #else /* ANY_LOCK_DEBUG */
151 #define DECL_PC(pc)
152 #ifdef lint
153 /*
154 * Eliminate lint complaints about unused local pc variables.
155 */
156 #define OBTAIN_PC(pc) ++pc
157 #else /* lint */
158 #define OBTAIN_PC(pc)
159 #endif /* lint */
160 #endif /* ANY_LOCK_DEBUG */
161
162
163 /*
164 * Portable lock package implementation of usimple_locks.
165 */
166
167 #if USLOCK_DEBUG
168 #define USLDBG(stmt) stmt
169 void usld_lock_init(usimple_lock_t, unsigned short);
170 void usld_lock_pre(usimple_lock_t, pc_t);
171 void usld_lock_post(usimple_lock_t, pc_t);
172 void usld_unlock(usimple_lock_t, pc_t);
173 void usld_lock_try_pre(usimple_lock_t, pc_t);
174 void usld_lock_try_post(usimple_lock_t, pc_t);
175 int usld_lock_common_checks(usimple_lock_t, char *);
176 #else /* USLOCK_DEBUG */
177 #define USLDBG(stmt)
178 #endif /* USLOCK_DEBUG */
179
180
181 extern int lck_rw_grab_want(lck_rw_t *lck);
182 extern int lck_rw_grab_shared(lck_rw_t *lck);
183 extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
184
185
186 /*
187 * Forward definitions
188 */
189
190 void lck_rw_lock_shared_gen(
191 lck_rw_t *lck);
192
193 void lck_rw_lock_exclusive_gen(
194 lck_rw_t *lck);
195
196 boolean_t lck_rw_lock_shared_to_exclusive_success(
197 lck_rw_t *lck);
198
199 boolean_t lck_rw_lock_shared_to_exclusive_failure(
200 lck_rw_t *lck,
201 int prior_lock_state);
202
203 void lck_rw_lock_exclusive_to_shared_gen(
204 lck_rw_t *lck,
205 int prior_lock_state);
206
207 lck_rw_type_t lck_rw_done_gen(
208 lck_rw_t *lck,
209 int prior_lock_state);
210
211 void lck_rw_clear_promotions_x86(thread_t thread);
212
213 /*
214 * Routine: lck_spin_alloc_init
215 */
216 lck_spin_t *
217 lck_spin_alloc_init(
218 lck_grp_t *grp,
219 lck_attr_t *attr)
220 {
221 lck_spin_t *lck;
222
223 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
224 lck_spin_init(lck, grp, attr);
225
226 return(lck);
227 }
228
229 /*
230 * Routine: lck_spin_free
231 */
232 void
233 lck_spin_free(
234 lck_spin_t *lck,
235 lck_grp_t *grp)
236 {
237 lck_spin_destroy(lck, grp);
238 kfree(lck, sizeof(lck_spin_t));
239 }
240
241 /*
242 * Routine: lck_spin_init
243 */
244 void
245 lck_spin_init(
246 lck_spin_t *lck,
247 lck_grp_t *grp,
248 __unused lck_attr_t *attr)
249 {
250 usimple_lock_init((usimple_lock_t) lck, 0);
251 lck_grp_reference(grp);
252 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
253 }
254
255 /*
256 * Routine: lck_spin_destroy
257 */
258 void
259 lck_spin_destroy(
260 lck_spin_t *lck,
261 lck_grp_t *grp)
262 {
263 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
264 return;
265 lck->interlock = LCK_SPIN_TAG_DESTROYED;
266 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
267 lck_grp_deallocate(grp);
268 return;
269 }
270
271 /*
272 * Routine: lck_spin_lock
273 */
274 void
275 lck_spin_lock(
276 lck_spin_t *lck)
277 {
278 usimple_lock((usimple_lock_t) lck);
279 }
280
281 /*
282 * Routine: lck_spin_unlock
283 */
284 void
285 lck_spin_unlock(
286 lck_spin_t *lck)
287 {
288 usimple_unlock((usimple_lock_t) lck);
289 }
290
291
292 /*
293 * Routine: lck_spin_try_lock
294 */
295 boolean_t
296 lck_spin_try_lock(
297 lck_spin_t *lck)
298 {
299 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
300 #if DEVELOPMENT || DEBUG
301 if (lrval) {
302 pltrace(FALSE);
303 }
304 #endif
305 return(lrval);
306 }
307
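/*
 * A minimal usage sketch of the spin lock routines above; the group name,
 * lock name, and wrapper function are illustrative only, and
 * lck_grp_alloc_init()/lck_grp_free() are the lock-group interfaces from
 * kern/locks.h.
 */
static void __unused
lck_spin_usage_sketch(void)
{
	lck_grp_t	*grp  = lck_grp_alloc_init("example.spinlock", LCK_GRP_ATTR_NULL);
	lck_spin_t	*lock = lck_spin_alloc_init(grp, LCK_ATTR_NULL);

	lck_spin_lock(lock);			/* spins if contended; returns with preemption disabled */
	/* ... short critical section, no blocking allowed ... */
	lck_spin_unlock(lock);

	if (lck_spin_try_lock(lock)) {		/* non-blocking attempt */
		lck_spin_unlock(lock);
	}

	lck_spin_free(lock, grp);		/* destroys the lock and drops its group reference */
	lck_grp_free(grp);
}
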
308 /*
309 * Routine: lck_spin_assert
310 */
311 void
312 lck_spin_assert(lck_spin_t *lock, unsigned int type)
313 {
314 thread_t thread, holder;
315 uintptr_t state;
316
317 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
318 panic("lck_spin_assert(): invalid arg (%u)", type);
319 }
320
321 state = lock->interlock;
322 holder = (thread_t)state;
323 thread = current_thread();
324 if (type == LCK_ASSERT_OWNED) {
325 if (__improbable(holder == THREAD_NULL)) {
326 panic("Lock not owned %p = %lx", lock, state);
327 }
328 if (__improbable(holder != thread)) {
329 panic("Lock not owned by current thread %p = %lx", lock, state);
330 }
331 } else if (type == LCK_ASSERT_NOTOWNED) {
332 if (__improbable(holder != THREAD_NULL)) {
333 if (holder == thread) {
334 panic("Lock owned by current thread %p = %lx", lock, state);
335 } else {
336 panic("Lock %p owned by thread %p", lock, holder);
337 }
338 }
339 }
340 }
341
342 /*
343 * Routine: kdp_lck_spin_is_acquired
344 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
345 * Returns: TRUE if lock is acquired.
346 */
347 boolean_t
348 kdp_lck_spin_is_acquired(lck_spin_t *lck) {
349 if (not_in_kdp) {
350 panic("panic: spinlock acquired check done outside of kernel debugger");
351 }
352 return (lck->interlock != 0)? TRUE : FALSE;
353 }
354
355 /*
356 * Initialize a usimple_lock.
357 *
358 * No change in preemption state.
359 */
360 void
361 usimple_lock_init(
362 usimple_lock_t l,
363 __unused unsigned short tag)
364 {
365 #ifndef MACHINE_SIMPLE_LOCK
366 USLDBG(usld_lock_init(l, tag));
367 hw_lock_init(&l->interlock);
368 #else
369 simple_lock_init((simple_lock_t)l,tag);
370 #endif
371 }
372
373 volatile uint32_t spinlock_owner_cpu = ~0;
374 volatile usimple_lock_t spinlock_timed_out;
375
376 uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
377 uint64_t deadline;
378 uint32_t i;
379
380 for (i = 0; i < real_ncpus; i++) {
381 if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
382 spinlock_owner_cpu = i;
383 if ((uint32_t) cpu_number() == i)
384 break;
385 cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
386 cpu_NMI_interrupt(i);
387 deadline = mach_absolute_time() + (LockTimeOut * 2);
388 while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
389 cpu_pause();
390 break;
391 }
392 }
393
394 return spinlock_owner_cpu;
395 }
396
397 /*
398 * Acquire a usimple_lock.
399 *
400 * Returns with preemption disabled. Note
401 * that the hw_lock routines are responsible for
402 * maintaining preemption state.
403 */
404 void
405 usimple_lock(
406 usimple_lock_t l)
407 {
408 #ifndef MACHINE_SIMPLE_LOCK
409 DECL_PC(pc);
410
411 OBTAIN_PC(pc);
412 USLDBG(usld_lock_pre(l, pc));
413
414 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
415 boolean_t uslock_acquired = FALSE;
416 while (machine_timeout_suspended()) {
417 enable_preemption();
418 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
419 break;
420 }
421
422 if (uslock_acquired == FALSE) {
423 uint32_t lock_cpu;
424 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
425 spinlock_timed_out = l;
426 lock_cpu = spinlock_timeout_NMI(lowner);
427 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
428 }
429 }
430 #if DEVELOPMENT || DEBUG
431 pltrace(FALSE);
432 #endif
433
434 USLDBG(usld_lock_post(l, pc));
435 #else
436 simple_lock((simple_lock_t)l);
437 #endif
438 }
439
440
441 /*
442 * Release a usimple_lock.
443 *
444 * Returns with preemption enabled. Note
445 * that the hw_lock routines are responsible for
446 * maintaining preemption state.
447 */
448 void
449 usimple_unlock(
450 usimple_lock_t l)
451 {
452 #ifndef MACHINE_SIMPLE_LOCK
453 DECL_PC(pc);
454
455 OBTAIN_PC(pc);
456 USLDBG(usld_unlock(l, pc));
457 #if DEVELOPMENT || DEBUG
458 pltrace(TRUE);
459 #endif
460 hw_lock_unlock(&l->interlock);
461 #else
462 simple_unlock_rwmb((simple_lock_t)l);
463 #endif
464 }
465
466
467 /*
468 * Conditionally acquire a usimple_lock.
469 *
470 * On success, returns with preemption disabled.
471 * On failure, returns with preemption in the same state
472 * as when first invoked. Note that the hw_lock routines
473 * are responsible for maintaining preemption state.
474 *
475 * XXX No stats are gathered on a miss; I preserved this
476 * behavior from the original assembly-language code, but
477 * doesn't it make sense to log misses? XXX
478 */
479 unsigned int
480 usimple_lock_try(
481 usimple_lock_t l)
482 {
483 #ifndef MACHINE_SIMPLE_LOCK
484 unsigned int success;
485 DECL_PC(pc);
486
487 OBTAIN_PC(pc);
488 USLDBG(usld_lock_try_pre(l, pc));
489 if ((success = hw_lock_try(&l->interlock))) {
490 #if DEVELOPMENT || DEBUG
491 pltrace(FALSE);
492 #endif
493 USLDBG(usld_lock_try_post(l, pc));
494 }
495 return success;
496 #else
497 return(simple_lock_try((simple_lock_t)l));
498 #endif
499 }
500
501 /*
502 * Acquire a usimple_lock while polling for pending TLB flushes
503 * and spinning on a lock.
504 *
505 */
506 void
507 usimple_lock_try_lock_loop(usimple_lock_t l)
508 {
509 boolean_t istate = ml_get_interrupts_enabled();
510 while (!simple_lock_try((l))) {
511 if (!istate)
512 handle_pending_TLB_flushes();
513 cpu_pause();
514 }
515 }
516
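/*
 * A minimal sketch of how the usimple_lock interface above is typically
 * used; the wrapper function and its parameter are illustrative only.
 */
static void __unused
usimple_lock_usage_sketch(usimple_lock_t l)
{
	usimple_lock_init(l, 0);

	usimple_lock(l);			/* spins; returns with preemption disabled */
	/* ... short critical section ... */
	usimple_unlock(l);			/* preemption re-enabled */

	if (usimple_lock_try(l)) {		/* preemption stays disabled only on success */
		usimple_unlock(l);
	}
}
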
517 #if USLOCK_DEBUG
518 /*
519 * States of a usimple_lock. The default when initializing
520 * a usimple_lock is setting it up for debug checking.
521 */
522 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
523 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
524 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
525 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
526 #define USLOCK_CHECKING(l) (uslock_check && \
527 ((l)->debug.state & USLOCK_CHECKED))
528
529 /*
530 * Trace activities of a particularly interesting lock.
531 */
532 void usl_trace(usimple_lock_t, int, pc_t, const char *);
533
534
535 /*
536 * Initialize the debugging information contained
537 * in a usimple_lock.
538 */
539 void
540 usld_lock_init(
541 usimple_lock_t l,
542 __unused unsigned short tag)
543 {
544 if (l == USIMPLE_LOCK_NULL)
545 panic("lock initialization: null lock pointer");
546 l->lock_type = USLOCK_TAG;
547 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
548 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
549 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
550 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
551 l->debug.duration[0] = l->debug.duration[1] = 0;
552 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
553 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
554 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
555 }
556
557
558 /*
559 * These checks apply to all usimple_locks, not just
560 * those with USLOCK_CHECKED turned on.
561 */
562 int
563 usld_lock_common_checks(
564 usimple_lock_t l,
565 char *caller)
566 {
567 if (l == USIMPLE_LOCK_NULL)
568 panic("%s: null lock pointer", caller);
569 if (l->lock_type != USLOCK_TAG)
570 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
571 if (!(l->debug.state & USLOCK_INIT))
572 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
573 return USLOCK_CHECKING(l);
574 }
575
576
577 /*
578 * Debug checks on a usimple_lock just before attempting
579 * to acquire it.
580 */
581 /* ARGSUSED */
582 void
583 usld_lock_pre(
584 usimple_lock_t l,
585 pc_t pc)
586 {
587 char caller[] = "usimple_lock";
588
589
590 if (!usld_lock_common_checks(l, caller))
591 return;
592
593 /*
594 * Note that we have a weird case where we are getting a lock when we are
595 * in the process of putting the system to sleep. We are running with no
596 * current thread, so we can't tell whether we are trying to retake a lock
597 * we already hold or whether another processor holds it. We therefore
598 * just ignore this test if the locking thread is 0.
599 */
600
601 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
602 l->debug.lock_thread == (void *) current_thread()) {
603 printf("%s: lock %p already locked (at %p) by",
604 caller, l, l->debug.lock_pc);
605 printf(" current thread %p (new attempt at pc %p)\n",
606 l->debug.lock_thread, pc);
607 panic("%s", caller);
608 }
609 mp_disable_preemption();
610 usl_trace(l, cpu_number(), pc, caller);
611 mp_enable_preemption();
612 }
613
614
615 /*
616 * Debug checks on a usimple_lock just after acquiring it.
617 *
618 * Pre-emption has been disabled at this point,
619 * so we are safe in using cpu_number.
620 */
621 void
622 usld_lock_post(
623 usimple_lock_t l,
624 pc_t pc)
625 {
626 int mycpu;
627 char caller[] = "successful usimple_lock";
628
629
630 if (!usld_lock_common_checks(l, caller))
631 return;
632
633 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
634 panic("%s: lock %p became uninitialized",
635 caller, l);
636 if ((l->debug.state & USLOCK_TAKEN))
637 panic("%s: lock 0x%p became TAKEN by someone else",
638 caller, l);
639
640 mycpu = cpu_number();
641 l->debug.lock_thread = (void *)current_thread();
642 l->debug.state |= USLOCK_TAKEN;
643 l->debug.lock_pc = pc;
644 l->debug.lock_cpu = mycpu;
645
646 usl_trace(l, mycpu, pc, caller);
647 }
648
649
650 /*
651 * Debug checks on a usimple_lock just before
652 * releasing it. Note that the caller has not
653 * yet released the hardware lock.
654 *
655 * Preemption is still disabled, so there's
656 * no problem using cpu_number.
657 */
658 void
659 usld_unlock(
660 usimple_lock_t l,
661 pc_t pc)
662 {
663 int mycpu;
664 char caller[] = "usimple_unlock";
665
666
667 if (!usld_lock_common_checks(l, caller))
668 return;
669
670 mycpu = cpu_number();
671
672 if (!(l->debug.state & USLOCK_TAKEN))
673 panic("%s: lock 0x%p hasn't been taken",
674 caller, l);
675 if (l->debug.lock_thread != (void *) current_thread())
676 panic("%s: unlocking lock 0x%p, owned by thread %p",
677 caller, l, l->debug.lock_thread);
678 if (l->debug.lock_cpu != mycpu) {
679 printf("%s: unlocking lock 0x%p on cpu 0x%x",
680 caller, l, mycpu);
681 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
682 panic("%s", caller);
683 }
684 usl_trace(l, mycpu, pc, caller);
685
686 l->debug.unlock_thread = l->debug.lock_thread;
687 l->debug.lock_thread = INVALID_THREAD;
688 l->debug.state &= ~USLOCK_TAKEN;
689 l->debug.unlock_pc = pc;
690 l->debug.unlock_cpu = mycpu;
691 }
692
693
694 /*
695 * Debug checks on a usimple_lock just before
696 * attempting to acquire it.
697 *
698 * Preemption isn't guaranteed to be disabled.
699 */
700 void
701 usld_lock_try_pre(
702 usimple_lock_t l,
703 pc_t pc)
704 {
705 char caller[] = "usimple_lock_try";
706
707 if (!usld_lock_common_checks(l, caller))
708 return;
709 mp_disable_preemption();
710 usl_trace(l, cpu_number(), pc, caller);
711 mp_enable_preemption();
712 }
713
714
715 /*
716 * Debug checks on a usimple_lock just after
717 * successfully attempting to acquire it.
718 *
719 * Preemption has been disabled by the
720 * lock acquisition attempt, so it's safe
721 * to use cpu_number.
722 */
723 void
724 usld_lock_try_post(
725 usimple_lock_t l,
726 pc_t pc)
727 {
728 int mycpu;
729 char caller[] = "successful usimple_lock_try";
730
731 if (!usld_lock_common_checks(l, caller))
732 return;
733
734 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
735 panic("%s: lock 0x%p became uninitialized",
736 caller, l);
737 if ((l->debug.state & USLOCK_TAKEN))
738 panic("%s: lock 0x%p became TAKEN by someone else",
739 caller, l);
740
741 mycpu = cpu_number();
742 l->debug.lock_thread = (void *) current_thread();
743 l->debug.state |= USLOCK_TAKEN;
744 l->debug.lock_pc = pc;
745 l->debug.lock_cpu = mycpu;
746
747 usl_trace(l, mycpu, pc, caller);
748 }
749
750
751 /*
752 * For very special cases, set traced_lock to point to a
753 * specific lock of interest. The result is a series of
754 * XPRs showing lock operations on that lock. The lock_seq
755 * value is used to show the order of those operations.
756 */
757 usimple_lock_t traced_lock;
758 unsigned int lock_seq;
759
760 void
761 usl_trace(
762 usimple_lock_t l,
763 int mycpu,
764 pc_t pc,
765 const char * op_name)
766 {
767 if (traced_lock == l) {
768 XPR(XPR_SLOCK,
769 "seq %d, cpu %d, %s @ %x\n",
770 (uintptr_t) lock_seq, (uintptr_t) mycpu,
771 (uintptr_t) op_name, (uintptr_t) pc, 0);
772 lock_seq++;
773 }
774 }
775
776
777 #endif /* USLOCK_DEBUG */
778
779 /*
780 * Routine: lck_rw_alloc_init
781 */
782 lck_rw_t *
783 lck_rw_alloc_init(
784 lck_grp_t *grp,
785 lck_attr_t *attr) {
786 lck_rw_t *lck;
787
788 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
789 bzero(lck, sizeof(lck_rw_t));
790 lck_rw_init(lck, grp, attr);
791 }
792
793 return(lck);
794 }
795
796 /*
797 * Routine: lck_rw_free
798 */
799 void
800 lck_rw_free(
801 lck_rw_t *lck,
802 lck_grp_t *grp) {
803 lck_rw_destroy(lck, grp);
804 kfree(lck, sizeof(lck_rw_t));
805 }
806
807 /*
808 * Routine: lck_rw_init
809 */
810 void
811 lck_rw_init(
812 lck_rw_t *lck,
813 lck_grp_t *grp,
814 lck_attr_t *attr)
815 {
816 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
817 attr : &LockDefaultLckAttr;
818
819 hw_lock_byte_init(&lck->lck_rw_interlock);
820 lck->lck_rw_want_write = FALSE;
821 lck->lck_rw_want_upgrade = FALSE;
822 lck->lck_rw_shared_count = 0;
823 lck->lck_rw_can_sleep = TRUE;
824 lck->lck_r_waiting = lck->lck_w_waiting = 0;
825 lck->lck_rw_tag = 0;
826 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
827 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
828
829 lck_grp_reference(grp);
830 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
831 }
832
833 /*
834 * Routine: lck_rw_destroy
835 */
836 void
837 lck_rw_destroy(
838 lck_rw_t *lck,
839 lck_grp_t *grp)
840 {
841 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
842 return;
843 #if MACH_LDEBUG
844 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
845 #endif
846 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
847 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
848 lck_grp_deallocate(grp);
849 return;
850 }
851
852 /*
853 * Sleep locks. These use the same data structure and algorithm
854 * as the spin locks, but the process sleeps while it is waiting
855 * for the lock. These work on uniprocessor systems.
856 */
857
858 #define DECREMENTER_TIMEOUT 1000000
859
860 /*
861 * We disable interrupts while holding the RW interlock to prevent an
862 * interrupt from exacerbating hold time.
863 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
864 */
865 static boolean_t
866 lck_interlock_lock(lck_rw_t *lck)
867 {
868 boolean_t istate;
869
870 istate = ml_set_interrupts_enabled(FALSE);
871 hw_lock_byte_lock(&lck->lck_rw_interlock);
872
873 return istate;
874 }
875
876 static void
877 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
878 {
879 hw_lock_byte_unlock(&lck->lck_rw_interlock);
880 ml_set_interrupts_enabled(istate);
881 }
882
883 /*
884 * This inline is used when busy-waiting for an rw lock.
885 * If interrupts were disabled when the lock primitive was called,
886 * we poll the IPI handler for pending tlb flushes.
887 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
888 */
889 static inline void
890 lck_rw_lock_pause(boolean_t interrupts_enabled)
891 {
892 if (!interrupts_enabled)
893 handle_pending_TLB_flushes();
894 cpu_pause();
895 }
896
897
898 /*
899 * compute the deadline to spin against when
900 * waiting for a change of state on a lck_rw_t
901 */
902 static inline uint64_t
903 lck_rw_deadline_for_spin(lck_rw_t *lck)
904 {
905 if (lck->lck_rw_can_sleep) {
906 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
907 /*
908 * there are already threads waiting on this lock... this
909 * implies that they have spun beyond their deadlines waiting for
910 * the desired state to show up so we will not bother spinning at this time...
911 * or
912 * the current number of threads sharing this lock exceeds our capacity to run them
913 * concurrently and since all states we're going to spin for require the rw_shared_count
914 * to be at 0, we'll not bother spinning since the latency for this to happen is
915 * unpredictable...
916 */
917 return (mach_absolute_time());
918 }
919 return (mach_absolute_time() + MutexSpin);
920 } else
921 return (mach_absolute_time() + (100000LL * 1000000000LL));
922 }
923
924
925 /*
926 * Routine: lck_rw_lock_exclusive
927 */
928 void
929 lck_rw_lock_exclusive_gen(
930 lck_rw_t *lck)
931 {
932 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
933 uint64_t deadline = 0;
934 int slept = 0;
935 int gotlock = 0;
936 int lockheld = 0;
937 wait_result_t res = 0;
938 boolean_t istate = -1;
939
940 #if CONFIG_DTRACE
941 boolean_t dtrace_ls_initialized = FALSE;
942 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
943 uint64_t wait_interval = 0;
944 int readers_at_sleep = 0;
945 #endif
946
947 /*
948 * Try to acquire the lck_rw_want_write bit.
949 */
950 while ( !lck_rw_grab_want(lck)) {
951
952 #if CONFIG_DTRACE
953 if (dtrace_ls_initialized == FALSE) {
954 dtrace_ls_initialized = TRUE;
955 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
956 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
957 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
958 if (dtrace_ls_enabled) {
959 /*
960 * Either sleeping or spinning is happening,
961 * start a timing of our delay interval now.
962 */
963 readers_at_sleep = lck->lck_rw_shared_count;
964 wait_interval = mach_absolute_time();
965 }
966 }
967 #endif
968 if (istate == -1)
969 istate = ml_get_interrupts_enabled();
970
971 deadline = lck_rw_deadline_for_spin(lck);
972
973 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
974
975 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
976 lck_rw_lock_pause(istate);
977
978 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
979
980 if (gotlock)
981 break;
982 /*
983 * if we get here, the deadline has expired w/o us
984 * being able to grab the lock exclusively
985 * check to see if we're allowed to do a thread_block
986 */
987 if (lck->lck_rw_can_sleep) {
988
989 istate = lck_interlock_lock(lck);
990
991 if (lck->lck_rw_want_write) {
992
993 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
994
995 lck->lck_w_waiting = TRUE;
996
997 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
998 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
999 lck_interlock_unlock(lck, istate);
1000
1001 if (res == THREAD_WAITING) {
1002 res = thread_block(THREAD_CONTINUE_NULL);
1003 slept++;
1004 }
1005 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1006 } else {
1007 lck->lck_rw_want_write = TRUE;
1008 lck_interlock_unlock(lck, istate);
1009 break;
1010 }
1011 }
1012 }
1013 /*
1014 * Wait for readers (and upgrades) to finish...
1015 * the test for these conditions must be done simultaneously with
1016 * a check of the interlock not being held since
1017 * the rw_shared_count will drop to 0 first and then want_upgrade
1018 * will be set to 1 in the shared_to_exclusive scenario... those
1019 * adjustments are done behind the interlock and represent an
1020 * atomic change in state and must be considered as such
1021 * however, once we see the read count at 0, the want_upgrade not set
1022 * and the interlock not held, we are safe to proceed
1023 */
1024 while (lck_rw_held_read_or_upgrade(lck)) {
1025
1026 #if CONFIG_DTRACE
1027 /*
1028 * Either sleeping or spinning is happening, start
1029 * a timing of our delay interval now. If we set it
1030 * to -1 we don't have accurate data so we cannot later
1031 * decide to record a dtrace spin or sleep event.
1032 */
1033 if (dtrace_ls_initialized == FALSE) {
1034 dtrace_ls_initialized = TRUE;
1035 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1036 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1037 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1038 if (dtrace_ls_enabled) {
1039 /*
1040 * Either sleeping or spinning is happening,
1041 * start a timing of our delay interval now.
1042 */
1043 readers_at_sleep = lck->lck_rw_shared_count;
1044 wait_interval = mach_absolute_time();
1045 }
1046 }
1047 #endif
1048 if (istate == -1)
1049 istate = ml_get_interrupts_enabled();
1050
1051 deadline = lck_rw_deadline_for_spin(lck);
1052
1053 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1054
1055 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1056 lck_rw_lock_pause(istate);
1057
1058 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1059
1060 if ( !lockheld)
1061 break;
1062 /*
1063 * if we get here, the deadline has expired w/o us
1064 * being able to grab the lock exclusively
1065 * check to see if we're allowed to do a thread_block
1066 */
1067 if (lck->lck_rw_can_sleep) {
1068
1069 istate = lck_interlock_lock(lck);
1070
1071 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1072 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1073
1074 lck->lck_w_waiting = TRUE;
1075
1076 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1077 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1078 lck_interlock_unlock(lck, istate);
1079
1080 if (res == THREAD_WAITING) {
1081 res = thread_block(THREAD_CONTINUE_NULL);
1082 slept++;
1083 }
1084 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1085 } else {
1086 lck_interlock_unlock(lck, istate);
1087 /*
1088 * must own the lock now, since we checked for
1089 * readers or upgrade owner behind the interlock
1090 * no need for a call to 'lck_rw_held_read_or_upgrade'
1091 */
1092 break;
1093 }
1094 }
1095 }
1096
1097 #if CONFIG_DTRACE
1098 /*
1099 * Decide what latencies we suffered that are Dtrace events.
1100 * If we have set wait_interval, then we either spun or slept.
1101 * At least we get out from under the interlock before we record
1102 * which is the best we can do here to minimize the impact
1103 * of the tracing.
1104 * If we have set wait_interval to -1, then dtrace was not enabled when we
1105 * started sleeping/spinning so we don't record this event.
1106 */
1107 if (dtrace_ls_enabled == TRUE) {
1108 if (slept == 0) {
1109 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1110 mach_absolute_time() - wait_interval, 1);
1111 } else {
1112 /*
1113 * For the blocking case, we also record whether the lock was held
1114 * for read or write when we blocked, and how many readers there were.
1115 * Notice that above we recorded this before we dropped
1116 * the interlock so the count is accurate.
1117 */
1118 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1119 mach_absolute_time() - wait_interval, 1,
1120 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1121 }
1122 }
1123 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1124 #endif
1125 }
1126
1127
1128 /*
1129 * Routine: lck_rw_done_gen
1130 *
1131 * called from the assembly language wrapper...
1132 * prior_lock_state is the value in the 1st
1133 * word of the lock at the time of a successful
1134 * atomic compare and exchange with the new value...
1135 * it represents the state of the lock before we
1136 * decremented the rw_shared_count or cleared either
1137 * rw_want_upgrade or rw_want_write and
1138 * the lck_x_waiting bits... since the wrapper
1139 * routine has already changed the state atomically,
1140 * we just need to decide if we should
1141 * wake up anyone and what value to return... we do
1142 * this by examining the state of the lock before
1143 * we changed it
1144 */
1145 lck_rw_type_t
1146 lck_rw_done_gen(
1147 lck_rw_t *lck,
1148 int prior_lock_state)
1149 {
1150 lck_rw_t *fake_lck;
1151 lck_rw_type_t lock_type;
1152 thread_t thread;
1153 uint32_t rwlock_count;
1154
1155 /*
1156 * prior_lock state is a snapshot of the 1st word of the
1157 * lock in question... we'll fake up a pointer to it
1158 * and carefully not access anything beyond what's defined
1159 * in the first word of a lck_rw_t
1160 */
1161 fake_lck = (lck_rw_t *)&prior_lock_state;
1162
1163 if (fake_lck->lck_rw_shared_count <= 1) {
1164 if (fake_lck->lck_w_waiting)
1165 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1166
1167 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1168 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1169 }
1170 if (fake_lck->lck_rw_shared_count)
1171 lock_type = LCK_RW_TYPE_SHARED;
1172 else
1173 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1174
1175 /* Check if dropping the lock means that we need to unpromote */
1176 thread = current_thread();
1177 rwlock_count = thread->rwlock_count--;
1178 #if MACH_LDEBUG
1179 if (rwlock_count == 0) {
1180 panic("rw lock count underflow for thread %p", thread);
1181 }
1182 #endif
1183 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1184 /* sched_flags checked without lock, but will be rechecked while clearing */
1185 lck_rw_clear_promotion(thread);
1186 }
1187
1188 #if CONFIG_DTRACE
1189 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1190 #endif
1191
1192 return(lock_type);
1193 }
1194
1195
1196 /*
1197 * Routine: lck_rw_unlock
1198 */
1199 void
1200 lck_rw_unlock(
1201 lck_rw_t *lck,
1202 lck_rw_type_t lck_rw_type)
1203 {
1204 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1205 lck_rw_unlock_shared(lck);
1206 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1207 lck_rw_unlock_exclusive(lck);
1208 else
1209 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1210 }
1211
1212
1213 /*
1214 * Routine: lck_rw_unlock_shared
1215 */
1216 void
1217 lck_rw_unlock_shared(
1218 lck_rw_t *lck)
1219 {
1220 lck_rw_type_t ret;
1221
1222 ret = lck_rw_done(lck);
1223
1224 if (ret != LCK_RW_TYPE_SHARED)
1225 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1226 }
1227
1228
1229 /*
1230 * Routine: lck_rw_unlock_exclusive
1231 */
1232 void
1233 lck_rw_unlock_exclusive(
1234 lck_rw_t *lck)
1235 {
1236 lck_rw_type_t ret;
1237
1238 ret = lck_rw_done(lck);
1239
1240 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1241 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1242 }
1243
1244
1245 /*
1246 * Routine: lck_rw_lock
1247 */
1248 void
1249 lck_rw_lock(
1250 lck_rw_t *lck,
1251 lck_rw_type_t lck_rw_type)
1252 {
1253 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1254 lck_rw_lock_shared(lck);
1255 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1256 lck_rw_lock_exclusive(lck);
1257 else
1258 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1259 }
1260
1261
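/*
 * A minimal usage sketch of the read-write lock interface whose slow paths
 * are implemented in this file; the group name, lock name, and wrapper
 * function are illustrative only, and lck_rw_lock_shared()/lck_rw_done()
 * are the standard entry points declared in kern/locks.h.
 */
static void __unused
lck_rw_usage_sketch(void)
{
	lck_grp_t	*grp = lck_grp_alloc_init("example.rwlock", LCK_GRP_ATTR_NULL);
	lck_rw_t	*lck = lck_rw_alloc_init(grp, LCK_ATTR_NULL);

	lck_rw_lock_shared(lck);			/* multiple readers may hold this concurrently */
	/* ... read-only access to the protected data ... */
	lck_rw_unlock_shared(lck);

	lck_rw_lock(lck, LCK_RW_TYPE_EXCLUSIVE);	/* typed form of lck_rw_lock_exclusive() */
	/* ... modify the protected data ... */
	(void) lck_rw_done(lck);			/* returns the mode that was just dropped */

	lck_rw_free(lck, grp);
	lck_grp_free(grp);
}
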
1262 /*
1263 * Routine: lck_rw_lock_shared_gen
1264 * Function:
1265 * assembly fast path code has determined that this lock
1266 * is held exclusively... this is where we spin/block
1267 * until we can acquire the lock in the shared mode
1268 */
1269 void
1270 lck_rw_lock_shared_gen(
1271 lck_rw_t *lck)
1272 {
1273 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1274 uint64_t deadline = 0;
1275 int gotlock = 0;
1276 int slept = 0;
1277 wait_result_t res = 0;
1278 boolean_t istate = -1;
1279
1280 #if CONFIG_DTRACE
1281 uint64_t wait_interval = 0;
1282 int readers_at_sleep = 0;
1283 boolean_t dtrace_ls_initialized = FALSE;
1284 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1285 #endif
1286
1287 while ( !lck_rw_grab_shared(lck)) {
1288
1289 #if CONFIG_DTRACE
1290 if (dtrace_ls_initialized == FALSE) {
1291 dtrace_ls_initialized = TRUE;
1292 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1293 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1294 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1295 if (dtrace_ls_enabled) {
1296 /*
1297 * Either sleeping or spinning is happening,
1298 * start a timing of our delay interval now.
1299 */
1300 readers_at_sleep = lck->lck_rw_shared_count;
1301 wait_interval = mach_absolute_time();
1302 }
1303 }
1304 #endif
1305 if (istate == -1)
1306 istate = ml_get_interrupts_enabled();
1307
1308 deadline = lck_rw_deadline_for_spin(lck);
1309
1310 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1311 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1312
1313 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1314 lck_rw_lock_pause(istate);
1315
1316 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1317 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1318
1319 if (gotlock)
1320 break;
1321 /*
1322 * if we get here, the deadline has expired w/o us
1323 * being able to grab the lock for read
1324 * check to see if we're allowed to do a thread_block
1325 */
1326 if (lck->lck_rw_can_sleep) {
1327
1328 istate = lck_interlock_lock(lck);
1329
1330 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1331 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1332
1333 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1334 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1335
1336 lck->lck_r_waiting = TRUE;
1337
1338 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1339 res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1340 lck_interlock_unlock(lck, istate);
1341
1342 if (res == THREAD_WAITING) {
1343 res = thread_block(THREAD_CONTINUE_NULL);
1344 slept++;
1345 }
1346 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1347 trace_lck, res, slept, 0, 0);
1348 } else {
1349 lck->lck_rw_shared_count++;
1350 lck_interlock_unlock(lck, istate);
1351 break;
1352 }
1353 }
1354 }
1355
1356 #if CONFIG_DTRACE
1357 if (dtrace_ls_enabled == TRUE) {
1358 if (slept == 0) {
1359 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1360 } else {
1361 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1362 mach_absolute_time() - wait_interval, 0,
1363 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1364 }
1365 }
1366 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1367 #endif
1368 }
1369
1370
1371 /*
1372 * Routine: lck_rw_lock_shared_to_exclusive_failure
1373 * Function:
1374 * assembly fast path code has already dropped our read
1375 * count and determined that someone else owns 'lck_rw_want_upgrade'
1376 * if 'lck_rw_shared_count' == 0, it's also already dropped 'lck_w_waiting'
1377 * all we need to do here is determine if a wakeup is needed
1378 */
1379 boolean_t
1380 lck_rw_lock_shared_to_exclusive_failure(
1381 lck_rw_t *lck,
1382 int prior_lock_state)
1383 {
1384 lck_rw_t *fake_lck;
1385 thread_t thread = current_thread();
1386 uint32_t rwlock_count;
1387
1388 /* Check if dropping the lock means that we need to unpromote */
1389 rwlock_count = thread->rwlock_count--;
1390 #if MACH_LDEBUG
1391 if (rwlock_count == 0) {
1392 panic("rw lock count underflow for thread %p", thread);
1393 }
1394 #endif
1395 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1396 /* sched_flags checked without lock, but will be rechecked while clearing */
1397 lck_rw_clear_promotion(thread);
1398 }
1399
1400 /*
1401 * prior_lock state is a snapshot of the 1st word of the
1402 * lock in question... we'll fake up a pointer to it
1403 * and carefully not access anything beyond what's defined
1404 * in the first word of a lck_rw_t
1405 */
1406 fake_lck = (lck_rw_t *)&prior_lock_state;
1407
1408 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1409 /*
1410 * Someone else has requested upgrade.
1411 * Since we've released the read lock, wake
1412 * him up if he's blocked waiting
1413 */
1414 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1415 }
1416 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1417 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1418
1419 return (FALSE);
1420 }
1421
1422
1423 /*
1424 * Routine: lck_rw_lock_shared_to_exclusive_success
1425 * Function:
1426 * assembly fast path code has already dropped our read
1427 * count and successfully acquired 'lck_rw_want_upgrade'
1428 * we just need to wait for the rest of the readers to drain
1429 * and then we can return as the exclusive holder of this lock
1430 */
1431 boolean_t
1432 lck_rw_lock_shared_to_exclusive_success(
1433 lck_rw_t *lck)
1434 {
1435 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1436 uint64_t deadline = 0;
1437 int slept = 0;
1438 int still_shared = 0;
1439 wait_result_t res;
1440 boolean_t istate = -1;
1441
1442 #if CONFIG_DTRACE
1443 uint64_t wait_interval = 0;
1444 int readers_at_sleep = 0;
1445 boolean_t dtrace_ls_initialized = FALSE;
1446 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1447 #endif
1448
1449 while (lck->lck_rw_shared_count != 0) {
1450
1451 #if CONFIG_DTRACE
1452 if (dtrace_ls_initialized == FALSE) {
1453 dtrace_ls_initialized = TRUE;
1454 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1455 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1456 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1457 if (dtrace_ls_enabled) {
1458 /*
1459 * Either sleeping or spinning is happening,
1460 * start a timing of our delay interval now.
1461 */
1462 readers_at_sleep = lck->lck_rw_shared_count;
1463 wait_interval = mach_absolute_time();
1464 }
1465 }
1466 #endif
1467 if (istate == -1)
1468 istate = ml_get_interrupts_enabled();
1469
1470 deadline = lck_rw_deadline_for_spin(lck);
1471
1472 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1473 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1474
1475 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1476 lck_rw_lock_pause(istate);
1477
1478 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1479 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1480
1481 if ( !still_shared)
1482 break;
1483 /*
1484 * if we get here, the deadline has expired w/o
1485 * the rw_shared_count having drained to 0
1486 * check to see if we're allowed to do a thread_block
1487 */
1488 if (lck->lck_rw_can_sleep) {
1489
1490 istate = lck_interlock_lock(lck);
1491
1492 if (lck->lck_rw_shared_count != 0) {
1493 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1494 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1495
1496 lck->lck_w_waiting = TRUE;
1497
1498 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1499 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1500 lck_interlock_unlock(lck, istate);
1501
1502 if (res == THREAD_WAITING) {
1503 res = thread_block(THREAD_CONTINUE_NULL);
1504 slept++;
1505 }
1506 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1507 trace_lck, res, slept, 0, 0);
1508 } else {
1509 lck_interlock_unlock(lck, istate);
1510 break;
1511 }
1512 }
1513 }
1514 #if CONFIG_DTRACE
1515 /*
1516 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1517 */
1518 if (dtrace_ls_enabled == TRUE) {
1519 if (slept == 0) {
1520 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1521 } else {
1522 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1523 mach_absolute_time() - wait_interval, 1,
1524 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1525 }
1526 }
1527 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1528 #endif
1529 return (TRUE);
1530 }
1531
1532
1533 /*
1534 * Routine: lck_rw_lock_exclusive_to_shared
1535 * Function:
1536 * assembly fast path has already dropped
1537 * our exclusive state and bumped lck_rw_shared_count
1538 * all we need to do here is determine if anyone
1539 * needs to be awakened.
1540 */
1541 void
1542 lck_rw_lock_exclusive_to_shared_gen(
1543 lck_rw_t *lck,
1544 int prior_lock_state)
1545 {
1546 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1547 lck_rw_t *fake_lck;
1548
1549 /*
1550 * prior_lock state is a snapshot of the 1st word of the
1551 * lock in question... we'll fake up a pointer to it
1552 * and carefully not access anything beyond what's defined
1553 * in the first word of a lck_rw_t
1554 */
1555 fake_lck = (lck_rw_t *)&prior_lock_state;
1556
1557 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1558 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1559
1560 /*
1561 * don't wake up anyone waiting to take the lock exclusively
1562 * since we hold a read count... when the read count drops to 0,
1563 * the writers will be woken.
1564 *
1565 * wake up any waiting readers if we don't have any writers waiting,
1566 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1567 */
1568 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1569 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1570
1571 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1572 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1573
1574 #if CONFIG_DTRACE
1575 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1576 #endif
1577 }
1578
1579
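/*
 * A minimal sketch of the shared-to-exclusive upgrade and the matching
 * downgrade serviced by the routines above; the wrapper function and its
 * parameter are illustrative only. Note that when
 * lck_rw_lock_shared_to_exclusive() returns FALSE, the fast path has
 * already dropped the read count, so the caller holds the lock in neither
 * mode.
 */
static void __unused
lck_rw_upgrade_sketch(lck_rw_t *lck)
{
	lck_rw_lock_shared(lck);

	if (lck_rw_lock_shared_to_exclusive(lck)) {
		/* upgrade succeeded: we are now the exclusive holder */
		lck_rw_lock_exclusive_to_shared(lck);	/* downgrade back to shared */
		lck_rw_unlock_shared(lck);
	} else {
		/* upgrade failed: our shared hold was released for us */
		lck_rw_lock_exclusive(lck);		/* reacquire from scratch if still needed */
		lck_rw_unlock_exclusive(lck);
	}
}
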
1580 /*
1581 * Routine: lck_rw_try_lock
1582 */
1583 boolean_t
1584 lck_rw_try_lock(
1585 lck_rw_t *lck,
1586 lck_rw_type_t lck_rw_type)
1587 {
1588 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1589 return(lck_rw_try_lock_shared(lck));
1590 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1591 return(lck_rw_try_lock_exclusive(lck));
1592 else
1593 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1594 return(FALSE);
1595 }
1596
1597
1598 void
1599 lck_rw_assert(
1600 lck_rw_t *lck,
1601 unsigned int type)
1602 {
1603 switch (type) {
1604 case LCK_RW_ASSERT_SHARED:
1605 if (lck->lck_rw_shared_count != 0) {
1606 return;
1607 }
1608 break;
1609 case LCK_RW_ASSERT_EXCLUSIVE:
1610 if ((lck->lck_rw_want_write ||
1611 lck->lck_rw_want_upgrade) &&
1612 lck->lck_rw_shared_count == 0) {
1613 return;
1614 }
1615 break;
1616 case LCK_RW_ASSERT_HELD:
1617 if (lck->lck_rw_want_write ||
1618 lck->lck_rw_want_upgrade ||
1619 lck->lck_rw_shared_count != 0) {
1620 return;
1621 }
1622 break;
1623 case LCK_RW_ASSERT_NOTHELD:
1624 if (!(lck->lck_rw_want_write ||
1625 lck->lck_rw_want_upgrade ||
1626 lck->lck_rw_shared_count != 0)) {
1627 return;
1628 }
1629 break;
1630 default:
1631 break;
1632 }
1633
1634 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1635 }
1636
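/*
 * A minimal sketch of how lck_rw_assert() is typically used to document and
 * enforce a routine's locking precondition; the wrapper function and its
 * parameter are illustrative only.
 */
static void __unused
lck_rw_assert_sketch(lck_rw_t *lck)
{
	/* caller must already hold the lock in exclusive mode */
	lck_rw_assert(lck, LCK_RW_ASSERT_EXCLUSIVE);

	/* ... code that relies on exclusive ownership ... */
}
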
1637 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1638 void
1639 lck_rw_clear_promotions_x86(thread_t thread)
1640 {
1641 #if MACH_LDEBUG
1642 /* It's fatal to leave a RW lock locked and return to userspace */
1643 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1644 #else
1645 /* Paper over the issue */
1646 thread->rwlock_count = 0;
1647 lck_rw_clear_promotion(thread);
1648 #endif
1649 }
1650
1651
1652 /*
1653 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1654 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1655 */
1656 boolean_t
1657 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
1658 if (not_in_kdp) {
1659 panic("panic: rw lock exclusive check done outside of kernel debugger");
1660 }
1661 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1662 }
1663
1664
1665 #ifdef MUTEX_ZONE
1666 extern zone_t lck_mtx_zone;
1667 #endif
1668 /*
1669 * Routine: lck_mtx_alloc_init
1670 */
1671 lck_mtx_t *
1672 lck_mtx_alloc_init(
1673 lck_grp_t *grp,
1674 lck_attr_t *attr)
1675 {
1676 lck_mtx_t *lck;
1677 #ifdef MUTEX_ZONE
1678 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
1679 lck_mtx_init(lck, grp, attr);
1680 #else
1681 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1682 lck_mtx_init(lck, grp, attr);
1683 #endif
1684 return(lck);
1685 }
1686
1687 /*
1688 * Routine: lck_mtx_free
1689 */
1690 void
1691 lck_mtx_free(
1692 lck_mtx_t *lck,
1693 lck_grp_t *grp)
1694 {
1695 lck_mtx_destroy(lck, grp);
1696 #ifdef MUTEX_ZONE
1697 zfree(lck_mtx_zone, lck);
1698 #else
1699 kfree(lck, sizeof(lck_mtx_t));
1700 #endif
1701 }
1702
1703 /*
1704 * Routine: lck_mtx_ext_init
1705 */
1706 static void
1707 lck_mtx_ext_init(
1708 lck_mtx_ext_t *lck,
1709 lck_grp_t *grp,
1710 lck_attr_t *attr)
1711 {
1712 bzero((void *)lck, sizeof(lck_mtx_ext_t));
1713
1714 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1715 lck->lck_mtx_deb.type = MUTEX_TAG;
1716 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1717 }
1718
1719 lck->lck_mtx_grp = grp;
1720
1721 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1722 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1723
1724 lck->lck_mtx.lck_mtx_is_ext = 1;
1725 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
1726 }
1727
1728 /*
1729 * Routine: lck_mtx_init
1730 */
1731 void
1732 lck_mtx_init(
1733 lck_mtx_t *lck,
1734 lck_grp_t *grp,
1735 lck_attr_t *attr)
1736 {
1737 lck_mtx_ext_t *lck_ext;
1738 lck_attr_t *lck_attr;
1739
1740 if (attr != LCK_ATTR_NULL)
1741 lck_attr = attr;
1742 else
1743 lck_attr = &LockDefaultLckAttr;
1744
1745 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1746 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1747 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1748 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1749 lck->lck_mtx_ptr = lck_ext;
1750 }
1751 } else {
1752 lck->lck_mtx_owner = 0;
1753 lck->lck_mtx_state = 0;
1754 }
1755 lck->lck_mtx_pad32 = 0xFFFFFFFF;
1756 lck_grp_reference(grp);
1757 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1758 }
1759
1760 /*
1761 * Routine: lck_mtx_init_ext
1762 */
1763 void
1764 lck_mtx_init_ext(
1765 lck_mtx_t *lck,
1766 lck_mtx_ext_t *lck_ext,
1767 lck_grp_t *grp,
1768 lck_attr_t *attr)
1769 {
1770 lck_attr_t *lck_attr;
1771
1772 if (attr != LCK_ATTR_NULL)
1773 lck_attr = attr;
1774 else
1775 lck_attr = &LockDefaultLckAttr;
1776
1777 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1778 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1779 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1780 lck->lck_mtx_ptr = lck_ext;
1781 } else {
1782 lck->lck_mtx_owner = 0;
1783 lck->lck_mtx_state = 0;
1784 }
1785 lck->lck_mtx_pad32 = 0xFFFFFFFF;
1786
1787 lck_grp_reference(grp);
1788 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1789 }
1790
1791 /*
1792 * Routine: lck_mtx_destroy
1793 */
1794 void
1795 lck_mtx_destroy(
1796 lck_mtx_t *lck,
1797 lck_grp_t *grp)
1798 {
1799 boolean_t lck_is_indirect;
1800
1801 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1802 return;
1803 #if MACH_LDEBUG
1804 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
1805 #endif
1806 lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1807
1808 lck_mtx_lock_mark_destroyed(lck);
1809
1810 if (lck_is_indirect)
1811 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1812 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1813 lck_grp_deallocate(grp);
1814 return;
1815 }
1816
1817
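/*
 * A minimal usage sketch of the mutex allocation/teardown routines above;
 * the group name, mutex name, and wrapper function are illustrative only,
 * and lck_mtx_lock()/lck_mtx_try_lock()/lck_mtx_unlock() are the standard
 * entry points declared in kern/locks.h.
 */
static void __unused
lck_mtx_usage_sketch(void)
{
	lck_grp_t	*grp = lck_grp_alloc_init("example.mutex", LCK_GRP_ATTR_NULL);
	lck_mtx_t	*mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

	lck_mtx_lock(mtx);			/* may block; safe to sleep while held */
	/* ... critical section; blocking operations are permitted ... */
	lck_mtx_unlock(mtx);

	if (lck_mtx_try_lock(mtx)) {		/* non-blocking attempt */
		lck_mtx_unlock(mtx);
	}

	lck_mtx_free(mtx, grp);			/* destroys the mutex and frees its storage */
	lck_grp_free(grp);
}
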
1818 #define LCK_MTX_LCK_WAIT_CODE 0x20
1819 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
1820 #define LCK_MTX_LCK_SPIN_CODE 0x22
1821 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
1822 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
1823
1824
1825 /*
1826 * Routine: lck_mtx_unlock_wakeup_x86
1827 *
1828 * Invoked on unlock when there is
1829 * contention (i.e. the assembly routine sees that
1830 * mutex->lck_mtx_waiters != 0 or
1831 * mutex->lck_mtx_promoted != 0)...
1832 *
1833 * neither the mutex nor the interlock is held
1834 */
1835 void
1836 lck_mtx_unlock_wakeup_x86 (
1837 lck_mtx_t *mutex,
1838 int prior_lock_state)
1839 {
1840 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1841 lck_mtx_t fake_lck;
1842
1843 /*
1844 * prior_lock state is a snapshot of the 2nd word of the
1845 * lock in question... we'll fake up a lock with the bits
1846 * copied into place and carefully not access anything
1847 * beyond whats defined in the second word of a lck_mtx_t
1848 */
1849 fake_lck.lck_mtx_state = prior_lock_state;
1850
1851 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
1852 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
1853
1854 if (__probable(fake_lck.lck_mtx_waiters)) {
1855 if (fake_lck.lck_mtx_waiters > 1)
1856 thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
1857 else
1858 thread_wakeup_one(LCK_MTX_EVENT(mutex));
1859 }
1860
1861 if (__improbable(fake_lck.lck_mtx_promoted)) {
1862 thread_t thread = current_thread();
1863
1864
1865 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
1866 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
1867
1868 if (thread->promotions > 0) {
1869 spl_t s = splsched();
1870
1871 thread_lock(thread);
1872
1873 if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
1874
1875 thread->sched_flags &= ~TH_SFLAG_PROMOTED;
1876
1877 if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
1878 /* Thread still has a RW lock promotion */
1879 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1880 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1881 thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
1882
1883 set_sched_pri(thread, DEPRESSPRI);
1884 }
1885 else {
1886 if (thread->base_pri < thread->sched_pri) {
1887 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1888 thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
1889
1890 thread_recompute_sched_pri(thread, FALSE);
1891 }
1892 }
1893 }
1894 thread_unlock(thread);
1895 splx(s);
1896 }
1897 }
1898 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
1899 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1900 }
1901
1902
1903 /*
1904 * Routine: lck_mtx_lock_acquire_x86
1905 *
1906 * Invoked on acquiring the mutex when there is
1907 * contention (i.e. the assembly routine sees that
1908 * mutex->lck_mtx_waiters != 0 or
1909 * thread->was_promoted_on_wakeup != 0)...
1910 *
1911 * mutex is owned... interlock is held... preemption is disabled
1912 */
1913 void
1914 lck_mtx_lock_acquire_x86(
1915 lck_mtx_t *mutex)
1916 {
1917 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1918 thread_t thread;
1919 integer_t priority;
1920 spl_t s;
1921
1922 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
1923 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1924
1925 if (mutex->lck_mtx_waiters)
1926 priority = mutex->lck_mtx_pri;
1927 else
1928 priority = 0;
1929
1930 thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
1931
1932 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
1933
1934 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
1935 thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);
1936
1937 s = splsched();
1938 thread_lock(thread);
1939
1940 if (thread->sched_pri < priority) {
1941 /* Do not promote past promotion ceiling */
1942 assert(priority <= MAXPRI_PROMOTE);
1943 set_sched_pri(thread, priority);
1944 }
1945 if (mutex->lck_mtx_promoted == 0) {
1946 mutex->lck_mtx_promoted = 1;
1947
1948 thread->promotions++;
1949 thread->sched_flags |= TH_SFLAG_PROMOTED;
1950 }
1951 thread->was_promoted_on_wakeup = 0;
1952
1953 thread_unlock(thread);
1954 splx(s);
1955 }
1956 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1957 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1958 }
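/*
 * Illustrative sketch (not part of the build): the promotion bookkeeping
 * protocol shared by lck_mtx_lock_acquire_x86(), lck_mtx_lock_wait_x86()
 * and lck_mtx_unlock_wakeup_x86().  Each mutex contributes at most one
 * promotion to its holder (guarded by lck_mtx_promoted); the holder's
 * promotions count tracks how many promoting mutexes are outstanding, and
 * TH_SFLAG_PROMOTED is cleared only when that count drains to zero.  The
 * helper names below are hypothetical; both assume the caller holds the
 * interlock and the thread lock at splsched().
 */
#if 0
static void
mutex_grant_promotion(lck_mtx_t *mutex, thread_t holder, integer_t priority)
{
	if (holder->sched_pri < priority)
		set_sched_pri(holder, priority);

	if (mutex->lck_mtx_promoted == 0) {
		mutex->lck_mtx_promoted = 1;	/* this mutex now accounts for one promotion */
		holder->promotions++;
		holder->sched_flags |= TH_SFLAG_PROMOTED;
	}
}

static void
mutex_release_promotion(thread_t thread)
{
	if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED))
		thread->sched_flags &= ~TH_SFLAG_PROMOTED;
	/* the unlock path then depresses or recomputes sched_pri as needed */
}
#endif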
1959
1960
1961 static int
1962 lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
1963 {
1964 int retval;
1965
1966 *istate = ml_set_interrupts_enabled(FALSE);
1967 retval = lck_mtx_ilk_try_lock(mutex);
1968
1969 if (retval == 0)
1970 ml_set_interrupts_enabled(*istate);
1971
1972 return retval;
1973 }
1974
1975 static void
1976 lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
1977 {
1978 lck_mtx_ilk_unlock(mutex);
1979 ml_set_interrupts_enabled(istate);
1980 }
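/*
 * Illustrative sketch (not part of the build): intended usage of the
 * interlock try/unlock pair above.  The interrupt state returned through
 * 'istate' must be handed back unchanged on unlock so interrupts are
 * restored exactly as they were found.  peek_owner() is a hypothetical
 * caller, shown only for the pattern.
 */
#if 0
static thread_t
peek_owner(lck_mtx_t *mutex)
{
	boolean_t	istate;
	thread_t	holder = THREAD_NULL;

	if (lck_mtx_interlock_try_lock(mutex, &istate)) {
		holder = (thread_t) mutex->lck_mtx_owner;
		lck_mtx_interlock_unlock(mutex, istate);
	}
	return holder;
}
#endif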
1981
1982
1983 /*
1984 * Routine: lck_mtx_lock_spinwait_x86
1985 *
1986 * Invoked trying to acquire a mutex when there is contention but
1987 * the holder is running on another processor. We spin for up to a maximum
1988 * time waiting for the lock to be released.
1989 *
1990 * Called with the interlock unlocked.
1991 * returns 0 if the mutex was acquired
1992 * returns 1 if we spun without acquiring the mutex
1993 * returns 2 if we didn't spin because the holder isn't running (or is idling)
1994 */
1995 int
1996 lck_mtx_lock_spinwait_x86(
1997 lck_mtx_t *mutex)
1998 {
1999 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2000 thread_t holder;
2001 uint64_t overall_deadline;
2002 uint64_t check_owner_deadline;
2003 uint64_t cur_time;
2004 int retval = 1;
2005 int loopcount = 0;
2006
2007 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2008 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
2009
2010 cur_time = mach_absolute_time();
2011 overall_deadline = cur_time + MutexSpin;
2012 check_owner_deadline = cur_time;
2013
2014 /*
2015 * Spin while:
2016 * - mutex is locked, and
2017 * - it's locked as a spin lock, and
2018 * - owner is running on another processor, and
2019 * - owner (processor) is not idling, and
2020 * - we haven't spun for long enough.
2021 */
2022 do {
2023 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
2024 retval = 0;
2025 break;
2026 }
2027 cur_time = mach_absolute_time();
2028
2029 if (cur_time >= overall_deadline)
2030 break;
2031
2032 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
2033 boolean_t istate;
2034
2035 if (lck_mtx_interlock_try_lock(mutex, &istate)) {
2036
2037 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
2038
2039 if ( !(holder->machine.specFlags & OnProc) ||
2040 (holder->state & TH_IDLE)) {
2041
2042 lck_mtx_interlock_unlock(mutex, istate);
2043
2044 if (loopcount == 0)
2045 retval = 2;
2046 break;
2047 }
2048 }
2049 lck_mtx_interlock_unlock(mutex, istate);
2050
2051 check_owner_deadline = cur_time + (MutexSpin / 4);
2052 }
2053 }
2054 cpu_pause();
2055
2056 loopcount++;
2057
2058 } while (TRUE);
2059
2060 #if CONFIG_DTRACE
2061 /*
2062 * overall_deadline already encodes when we started spinning; if DTrace
2063 * is active, we work backwards (start = overall_deadline - MutexSpin)
2064 * to compute how long we spun.
2065 *
2066 * Note that we record a different probe id depending on whether
2067 * this is a direct or indirect mutex. This allows us to
2068 * penalize only lock groups that have debug/stats enabled
2069 * with dtrace processing if desired.
2070 */
2071 if (__probable(mutex->lck_mtx_is_ext == 0)) {
2072 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
2073 mach_absolute_time() - (overall_deadline - MutexSpin));
2074 } else {
2075 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
2076 mach_absolute_time() - (overall_deadline - MutexSpin));
2077 }
2078 /* The lockstat acquire event is recorded by the assembly code beneath us. */
2079 #endif
2080
2081 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2082 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
2083
2084 return retval;
2085 }
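/*
 * Illustrative sketch (not part of the build): how a caller could dispatch
 * on the return codes documented above (the real dispatch lives in the
 * assembly slow path).  lck_mtx_lock_contended() is a hypothetical name.
 */
#if 0
static void
lck_mtx_lock_contended(lck_mtx_t *mutex)
{
	if (lck_mtx_lock_spinwait_x86(mutex) == 0)
		return;		/* acquired the mutex while spinning */

	/*
	 * 1: spun for the full window without getting the mutex.
	 * 2: the holder isn't running, so spinning would only burn cycles.
	 * Either way the slow path takes the interlock and blocks in
	 * lck_mtx_lock_wait_x86(), which drops the interlock before sleeping.
	 */
}
#endif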
2086
2087
2088
2089 /*
2090 * Routine: lck_mtx_lock_wait_x86
2091 *
2092 * Invoked in order to wait on contention.
2093 *
2094 * Called with the interlock locked and
2095 * preemption disabled...
2096 * returns it unlocked and with preemption enabled
2097 */
2098 void
2099 lck_mtx_lock_wait_x86 (
2100 lck_mtx_t *mutex)
2101 {
2102 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2103 thread_t self = current_thread();
2104 thread_t holder;
2105 integer_t priority;
2106 spl_t s;
2107 #if CONFIG_DTRACE
2108 uint64_t sleep_start = 0;
2109
2110 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
2111 sleep_start = mach_absolute_time();
2112 }
2113 #endif
2114 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
2115 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2116
2117 priority = self->sched_pri;
2118
2119 if (priority < self->base_pri)
2120 priority = self->base_pri;
2121 if (priority < BASEPRI_DEFAULT)
2122 priority = BASEPRI_DEFAULT;
2123
2124 /* Do not promote past promotion ceiling */
2125 priority = MIN(priority, MAXPRI_PROMOTE);
2126
2127 if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
2128 mutex->lck_mtx_pri = priority;
2129 mutex->lck_mtx_waiters++;
2130
2131 if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
2132 holder->sched_pri < mutex->lck_mtx_pri ) {
2133 s = splsched();
2134 thread_lock(holder);
2135
2136 /* holder priority may have been bumped by another thread
2137 * before thread_lock was taken
2138 */
2139 if (holder->sched_pri < mutex->lck_mtx_pri) {
2140 KERNEL_DEBUG_CONSTANT(
2141 MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
2142 holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
2143 /* Assert that we're not altering the priority of a
2144 * thread above the MAXPRI_PROMOTE band
2145 */
2146 assert(holder->sched_pri < MAXPRI_PROMOTE);
2147 set_sched_pri(holder, priority);
2148
2149 if (mutex->lck_mtx_promoted == 0) {
2150 holder->promotions++;
2151 holder->sched_flags |= TH_SFLAG_PROMOTED;
2152
2153 mutex->lck_mtx_promoted = 1;
2154 }
2155 }
2156 thread_unlock(holder);
2157 splx(s);
2158 }
2159 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
2160 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
2161
2162 lck_mtx_ilk_unlock(mutex);
2163
2164 thread_block(THREAD_CONTINUE_NULL);
2165
2166 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
2167 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2168
2169 #if CONFIG_DTRACE
2170 /*
2171 * Record the DTrace lockstat probe for blocking; block time is
2172 * measured from when this routine was entered.
2173 */
2174 if (sleep_start) {
2175 if (mutex->lck_mtx_is_ext == 0) {
2176 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2177 mach_absolute_time() - sleep_start);
2178 } else {
2179 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2180 mach_absolute_time() - sleep_start);
2181 }
2182 }
2183 #endif
2184 }
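/*
 * Illustrative sketch (not part of the build): the wakeup-safe ordering at
 * the end of lck_mtx_lock_wait_x86() above.  The waiter registers itself on
 * the event while the interlock is still held, so any unlocker that later
 * takes the interlock is guaranteed to see lck_mtx_waiters != 0 and post a
 * wakeup; only then does the waiter drop the interlock and actually block.
 */
#if 0
	assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);	/* 1. enqueue as a waiter (interlock held) */
	lck_mtx_ilk_unlock(mutex);				/* 2. now the unlock path can run */
	thread_block(THREAD_CONTINUE_NULL);			/* 3. sleep until lck_mtx_unlock_wakeup_x86() wakes us */
#endif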
2185
2186 /*
2187 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2188 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2189 * Returns: TRUE if lock is acquired.
2190 */
2191 boolean_t
2192 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2193 {
2194 if (not_in_kdp) {
2195 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
2196 }
2197
2198 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
2199 return TRUE;
2200 }
2201
2202 return FALSE;
2203 }
2204
2205 void
2206 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2207 {
2208 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
2209 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2210 thread_t holder = (thread_t)mutex->lck_mtx_owner;
2211 waitinfo->owner = thread_tid(holder);
2212 }
2213
2214 void
2215 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2216 {
2217 lck_rw_t *rwlck = NULL;
2218 switch(waitinfo->wait_type) {
2219 case kThreadWaitKernelRWLockRead:
2220 rwlck = READ_EVENT_TO_RWLOCK(event);
2221 break;
2222 case kThreadWaitKernelRWLockWrite:
2223 case kThreadWaitKernelRWLockUpgrade:
2224 rwlck = WRITE_EVENT_TO_RWLOCK(event);
2225 break;
2226 default:
2227 panic("%s was called with an invalid blocking type", __FUNCTION__);
2228 break;
2229 }
2230 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2231 waitinfo->owner = 0;
2232 }