/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *    Author: Avadis Tevanian, Jr., Michael Wayne Young
 *
 *    Locking primitives implementation
 */
#include <mach_ldebug.h>

#include <kern/locks.h>
#include <kern/kalloc.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>

#include <i386/machine_routines.h>    /* machine_timeout_suspended() */
#include <machine/machine_cpu.h>

#include <sys/kdebug.h>
#include <mach/branch_predicates.h>
#if CONFIG_DTRACE
/*
 * We need only enough declarations from the BSD-side to be able to
 * test if our probe is active, and to call __dtrace_probe().  Setting
 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
 */
#define NEED_DTRACE_DEFS
#include <../bsd/sys/lockstat.h>
#endif
#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
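/*
 * These per-event codes are combined with MACHDBG_CODE(DBG_MACH_LOCKS, ...)
 * and a DBG_FUNC_START/DBG_FUNC_END qualifier when handed to KERNEL_DEBUG
 * below, so each spin or wait phase of the rw-lock slow paths shows up as
 * a matched interval in a kdebug trace.
 */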
#define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

unsigned int LcksOpts = 0;

#if USLOCK_DEBUG
/*
 *    Perform simple lock checks.
 */
int uslock_check = 1;
int max_lock_loops = 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
#endif /* USLOCK_DEBUG */

extern unsigned int not_in_kdp;
/*
 *    We often want to know the addresses of the callers
 *    of the various lock routines.  However, this information
 *    is only used for debugging and statistics.
 */
typedef void    *pc_t;
#define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)

#if ANY_LOCK_DEBUG
#define OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())
#define DECL_PC(pc)     pc_t pc;
#else /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef lint
/*
 *    Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc)   ++pc
#else /* lint */
#define OBTAIN_PC(pc)
#endif /* lint */
#endif /* ANY_LOCK_DEBUG */
/*
 *    Portable lock package implementation of usimple_locks.
 */

#if USLOCK_DEBUG
#define USLDBG(stmt)    stmt
void    usld_lock_init(usimple_lock_t, unsigned short);
void    usld_lock_pre(usimple_lock_t, pc_t);
void    usld_lock_post(usimple_lock_t, pc_t);
void    usld_unlock(usimple_lock_t, pc_t);
void    usld_lock_try_pre(usimple_lock_t, pc_t);
void    usld_lock_try_post(usimple_lock_t, pc_t);
int     usld_lock_common_checks(usimple_lock_t, char *);
#else /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif /* USLOCK_DEBUG */
extern int lck_rw_grab_want(lck_rw_t *lck);
extern int lck_rw_grab_shared(lck_rw_t *lck);
extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
/*
 * Forward definitions
 */

void lck_rw_lock_shared_gen(
    lck_rw_t    *lck);

void lck_rw_lock_exclusive_gen(
    lck_rw_t    *lck);

boolean_t lck_rw_lock_shared_to_exclusive_success(
    lck_rw_t    *lck);

boolean_t lck_rw_lock_shared_to_exclusive_failure(
    lck_rw_t    *lck,
    int         prior_lock_state);

void lck_rw_lock_exclusive_to_shared_gen(
    lck_rw_t    *lck,
    int         prior_lock_state);

lck_rw_type_t lck_rw_done_gen(
    lck_rw_t    *lck,
    int         prior_lock_state);

void lck_rw_clear_promotions_x86(thread_t thread);
/*
 *      Routine:        lck_spin_alloc_init
 */
lck_spin_t *
lck_spin_alloc_init(
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_spin_t  *lck;

    if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
        lck_spin_init(lck, grp, attr);

    return(lck);
}

/*
 *      Routine:        lck_spin_free
 */
void
lck_spin_free(
    lck_spin_t  *lck,
    lck_grp_t   *grp)
{
    lck_spin_destroy(lck, grp);
    kfree(lck, sizeof(lck_spin_t));
}

/*
 *      Routine:        lck_spin_init
 */
void
lck_spin_init(
    lck_spin_t  *lck,
    lck_grp_t   *grp,
    __unused lck_attr_t *attr)
{
    usimple_lock_init((usimple_lock_t) lck, 0);
    lck_grp_reference(grp);
    lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
}

/*
 *      Routine:        lck_spin_destroy
 */
void
lck_spin_destroy(
    lck_spin_t  *lck,
    lck_grp_t   *grp)
{
    if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
        return;
    lck->interlock = LCK_SPIN_TAG_DESTROYED;
    lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
    lck_grp_deallocate(grp);
    return;
}

/*
 *      Routine:        lck_spin_lock
 */
void
lck_spin_lock(
    lck_spin_t  *lck)
{
    usimple_lock((usimple_lock_t) lck);
}

/*
 *      Routine:        lck_spin_unlock
 */
void
lck_spin_unlock(
    lck_spin_t  *lck)
{
    usimple_unlock((usimple_lock_t) lck);
}

/*
 *      Routine:        lck_spin_try_lock
 */
boolean_t
lck_spin_try_lock(
    lck_spin_t  *lck)
{
    boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
#if DEVELOPMENT || DEBUG
    if (lrval) {
        pltrace(FALSE);
    }
#endif
    return(lrval);
}
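/*
 * Illustrative client sketch (not part of this file): a try-lock lets a
 * caller avoid spinning when the lock is contended, e.g.
 *
 *    if (lck_spin_try_lock(lck)) {
 *        ... brief critical section ...
 *        lck_spin_unlock(lck);
 *    } else {
 *        ... defer the work or take a slower path ...
 *    }
 *
 * On success the thread returns with preemption disabled, exactly as if
 * lck_spin_lock() had been called.
 */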
/*
 *      Routine:        lck_spin_assert
 */
void
lck_spin_assert(lck_spin_t *lock, unsigned int type)
{
    thread_t thread, holder;
    uintptr_t state;

    if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
        panic("lck_spin_assert(): invalid arg (%u)", type);
    }

    state = lock->interlock;
    holder = (thread_t)state;
    thread = current_thread();
    if (type == LCK_ASSERT_OWNED) {
        if (__improbable(holder == THREAD_NULL)) {
            panic("Lock not owned %p = %lx", lock, state);
        }
        if (__improbable(holder != thread)) {
            panic("Lock not owned by current thread %p = %lx", lock, state);
        }
    } else if (type == LCK_ASSERT_NOTOWNED) {
        if (__improbable(holder != THREAD_NULL)) {
            if (holder == thread) {
                panic("Lock owned by current thread %p = %lx", lock, state);
            } else {
                panic("Lock %p owned by thread %p", lock, holder);
            }
        }
    }
}
/*
 * Routine: kdp_lck_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 * Returns: TRUE if lock is acquired.
 */
boolean_t
kdp_lck_spin_is_acquired(lck_spin_t *lck) {
    if (not_in_kdp) {
        panic("panic: spinlock acquired check done outside of kernel debugger");
    }
    return (lck->interlock != 0)? TRUE : FALSE;
}
/*
 *    Initialize a usimple_lock.
 *
 *    No change in preemption state.
 */
void
usimple_lock_init(
    usimple_lock_t  l,
    __unused unsigned short tag)
{
#ifndef MACHINE_SIMPLE_LOCK
    USLDBG(usld_lock_init(l, tag));
    hw_lock_init(&l->interlock);
#else
    simple_lock_init((simple_lock_t)l, tag);
#endif
}
volatile uint32_t spinlock_owner_cpu = ~0;
volatile usimple_lock_t spinlock_timed_out;

uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
    uint64_t deadline;
    uint32_t i;

    for (i = 0; i < real_ncpus; i++) {
        if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
            spinlock_owner_cpu = i;
            if ((uint32_t) cpu_number() == i)
                break;
            cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
            cpu_NMI_interrupt(i);
            deadline = mach_absolute_time() + (LockTimeOut * 2);
            while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
                cpu_pause();
            break;
        }
    }

    return spinlock_owner_cpu;
}
/*
 *    Acquire a usimple_lock.
 *
 *    Returns with preemption disabled.  Note
 *    that the hw_lock routines are responsible for
 *    maintaining preemption state.
 */
void
usimple_lock(
    usimple_lock_t  l)
{
#ifndef MACHINE_SIMPLE_LOCK
    DECL_PC(pc);

    OBTAIN_PC(pc);
    USLDBG(usld_lock_pre(l, pc));

    if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
        boolean_t uslock_acquired = FALSE;
        while (machine_timeout_suspended()) {
            enable_preemption();
            if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
                break;
        }

        if (uslock_acquired == FALSE) {
            uint32_t lock_cpu;
            uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
            spinlock_timed_out = l;
            lock_cpu = spinlock_timeout_NMI(lowner);
            panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
        }
    }
#if DEVELOPMENT || DEBUG
    pltrace(FALSE);
#endif

    USLDBG(usld_lock_post(l, pc));
#else
    simple_lock((simple_lock_t)l);
#endif
}
/*
 *    Release a usimple_lock.
 *
 *    Returns with preemption enabled.  Note
 *    that the hw_lock routines are responsible for
 *    maintaining preemption state.
 */
void
usimple_unlock(
    usimple_lock_t  l)
{
#ifndef MACHINE_SIMPLE_LOCK
    DECL_PC(pc);

    OBTAIN_PC(pc);
    USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
    pltrace(TRUE);
#endif
    hw_lock_unlock(&l->interlock);
#else
    simple_unlock_rwmb((simple_lock_t)l);
#endif
}
/*
 *    Conditionally acquire a usimple_lock.
 *
 *    On success, returns with preemption disabled.
 *    On failure, returns with preemption in the same state
 *    as when first invoked.  Note that the hw_lock routines
 *    are responsible for maintaining preemption state.
 *
 *    XXX No stats are gathered on a miss; I preserved this
 *    behavior from the original assembly-language code, but
 *    doesn't it make sense to log misses?  XXX
 */
unsigned int
usimple_lock_try(
    usimple_lock_t  l)
{
#ifndef MACHINE_SIMPLE_LOCK
    unsigned int    success;
    DECL_PC(pc);

    OBTAIN_PC(pc);
    USLDBG(usld_lock_try_pre(l, pc));
    if ((success = hw_lock_try(&l->interlock))) {
#if DEVELOPMENT || DEBUG
        pltrace(FALSE);
#endif
        USLDBG(usld_lock_try_post(l, pc));
    }
    return success;
#else
    return(simple_lock_try((simple_lock_t)l));
#endif
}
/*
 * Acquire a usimple_lock while polling for pending TLB flushes
 * and spinning on a lock.
 */
void
usimple_lock_try_lock_loop(usimple_lock_t l)
{
    boolean_t istate = ml_get_interrupts_enabled();
    while (!simple_lock_try((l))) {
        if (!istate)
            handle_pending_TLB_flushes();
        cpu_pause();
    }
}
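/*
 * The polling above matters because a CPU can spin here with interrupts
 * disabled: servicing handle_pending_TLB_flushes() between attempts keeps
 * this CPU responsive to TLB-shootdown requests that would otherwise
 * deadlock against the current lock holder.
 */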
#if USLOCK_DEBUG
/*
 *    States of a usimple_lock.  The default when initializing
 *    a usimple_lock is setting it up for debug checking.
 */
#define USLOCK_CHECKED      0x0001  /* lock is being checked */
#define USLOCK_TAKEN        0x0002  /* lock has been taken */
#define USLOCK_INIT         0xBAA0  /* lock has been initialized */
#define USLOCK_INITIALIZED  (USLOCK_INIT|USLOCK_CHECKED)
#define USLOCK_CHECKING(l)  (uslock_check && \
                             ((l)->debug.state & USLOCK_CHECKED))

/*
 *    Trace activities of a particularly interesting lock.
 */
void    usl_trace(usimple_lock_t, int, pc_t, const char *);
/*
 *    Initialize the debugging information contained
 *    in a usimple_lock.
 */
void
usld_lock_init(
    usimple_lock_t  l,
    __unused unsigned short tag)
{
    if (l == USIMPLE_LOCK_NULL)
        panic("lock initialization:  null lock pointer");
    l->lock_type = USLOCK_TAG;
    l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
    l->debug.lock_cpu = l->debug.unlock_cpu = 0;
    l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
    l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
    l->debug.duration[0] = l->debug.duration[1] = 0;
    l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
    l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
    l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
}
/*
 *    These checks apply to all usimple_locks, not just
 *    those with USLOCK_CHECKED turned on.
 */
int
usld_lock_common_checks(
    usimple_lock_t  l,
    char        *caller)
{
    if (l == USIMPLE_LOCK_NULL)
        panic("%s:  null lock pointer", caller);
    if (l->lock_type != USLOCK_TAG)
        panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
    if (!(l->debug.state & USLOCK_INIT))
        panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
    return USLOCK_CHECKING(l);
}
/*
 *    Debug checks on a usimple_lock just before attempting
 *    to acquire it.
 */
void
usld_lock_pre(
    usimple_lock_t  l,
    pc_t        pc)
{
    char    caller[] = "usimple_lock";

    if (!usld_lock_common_checks(l, caller))
        return;

/*
 *    Note that we have a weird case where we are getting a lock when we are
 *    in the process of putting the system to sleep. We are running with no
 *    current threads, therefore we can't tell if we are trying to retake a lock
 *    we have or someone on the other processor has it.  Therefore we just
 *    ignore this test if the locking thread is 0.
 */

    if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
        l->debug.lock_thread == (void *) current_thread()) {
        printf("%s:  lock %p already locked (at %p) by",
               caller, l, l->debug.lock_pc);
        printf(" current thread %p (new attempt at pc %p)\n",
               l->debug.lock_thread, pc);
        panic("%s", caller);
    }
    mp_disable_preemption();
    usl_trace(l, cpu_number(), pc, caller);
    mp_enable_preemption();
}
/*
 *    Debug checks on a usimple_lock just after acquiring it.
 *
 *    Pre-emption has been disabled at this point,
 *    so we are safe in using cpu_number.
 */
void
usld_lock_post(
    usimple_lock_t  l,
    pc_t        pc)
{
    int     mycpu;
    char    caller[] = "successful usimple_lock";

    if (!usld_lock_common_checks(l, caller))
        return;

    if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
        panic("%s:  lock %p became uninitialized",
              caller, l);
    if ((l->debug.state & USLOCK_TAKEN))
        panic("%s:  lock 0x%p became TAKEN by someone else",
              caller, l);

    mycpu = cpu_number();
    l->debug.lock_thread = (void *)current_thread();
    l->debug.state |= USLOCK_TAKEN;
    l->debug.lock_pc = pc;
    l->debug.lock_cpu = mycpu;

    usl_trace(l, mycpu, pc, caller);
}
/*
 *    Debug checks on a usimple_lock just before
 *    releasing it.  Note that the caller has not
 *    yet released the hardware lock.
 *
 *    Preemption is still disabled, so there's
 *    no problem using cpu_number.
 */
void
usld_unlock(
    usimple_lock_t  l,
    pc_t        pc)
{
    int     mycpu;
    char    caller[] = "usimple_unlock";

    if (!usld_lock_common_checks(l, caller))
        return;

    mycpu = cpu_number();

    if (!(l->debug.state & USLOCK_TAKEN))
        panic("%s:  lock 0x%p hasn't been taken",
              caller, l);
    if (l->debug.lock_thread != (void *) current_thread())
        panic("%s:  unlocking lock 0x%p, owned by thread %p",
              caller, l, l->debug.lock_thread);
    if (l->debug.lock_cpu != mycpu) {
        printf("%s:  unlocking lock 0x%p on cpu 0x%x",
               caller, l, mycpu);
        printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
        panic("%s", caller);
    }
    usl_trace(l, mycpu, pc, caller);

    l->debug.unlock_thread = l->debug.lock_thread;
    l->debug.lock_thread = INVALID_PC;
    l->debug.state &= ~USLOCK_TAKEN;
    l->debug.unlock_pc = pc;
    l->debug.unlock_cpu = mycpu;
}
/*
 *    Debug checks on a usimple_lock just before
 *    attempting to acquire it.
 *
 *    Preemption isn't guaranteed to be disabled.
 */
void
usld_lock_try_pre(
    usimple_lock_t  l,
    pc_t        pc)
{
    char    caller[] = "usimple_lock_try";

    if (!usld_lock_common_checks(l, caller))
        return;
    mp_disable_preemption();
    usl_trace(l, cpu_number(), pc, caller);
    mp_enable_preemption();
}
/*
 *    Debug checks on a usimple_lock just after
 *    successfully attempting to acquire it.
 *
 *    Preemption has been disabled by the
 *    lock acquisition attempt, so it's safe
 *    to use cpu_number.
 */
void
usld_lock_try_post(
    usimple_lock_t  l,
    pc_t        pc)
{
    int     mycpu;
    char    caller[] = "successful usimple_lock_try";

    if (!usld_lock_common_checks(l, caller))
        return;

    if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
        panic("%s:  lock 0x%p became uninitialized",
              caller, l);
    if ((l->debug.state & USLOCK_TAKEN))
        panic("%s:  lock 0x%p became TAKEN by someone else",
              caller, l);

    mycpu = cpu_number();
    l->debug.lock_thread = (void *) current_thread();
    l->debug.state |= USLOCK_TAKEN;
    l->debug.lock_pc = pc;
    l->debug.lock_cpu = mycpu;

    usl_trace(l, mycpu, pc, caller);
}
/*
 *    For very special cases, set traced_lock to point to a
 *    specific lock of interest.  The result is a series of
 *    XPRs showing lock operations on that lock.  The lock_seq
 *    value is used to show the order of those operations.
 */
usimple_lock_t  traced_lock;
unsigned int    lock_seq;

void
usl_trace(
    usimple_lock_t  l,
    int         mycpu,
    pc_t        pc,
    const char *    op_name)
{
    if (traced_lock == l) {
        XPR(XPR_SLOCK,
            "seq %d, cpu %d, %s @ %x\n",
            (uintptr_t) lock_seq, (uintptr_t) mycpu,
            (uintptr_t) op_name, (uintptr_t) pc, 0);
        lock_seq++;
    }
}

#endif /* USLOCK_DEBUG */
/*
 *      Routine:        lck_rw_alloc_init
 */
lck_rw_t *
lck_rw_alloc_init(
    lck_grp_t   *grp,
    lck_attr_t  *attr) {
    lck_rw_t    *lck;

    if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
        bzero(lck, sizeof(lck_rw_t));
        lck_rw_init(lck, grp, attr);
    }

    return(lck);
}

/*
 *      Routine:        lck_rw_free
 */
void
lck_rw_free(
    lck_rw_t    *lck,
    lck_grp_t   *grp) {
    lck_rw_destroy(lck, grp);
    kfree(lck, sizeof(lck_rw_t));
}
/*
 *      Routine:        lck_rw_init
 */
void
lck_rw_init(
    lck_rw_t    *lck,
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_attr_t  *lck_attr = (attr != LCK_ATTR_NULL) ?
                    attr : &LockDefaultLckAttr;

    hw_lock_byte_init(&lck->lck_rw_interlock);
    lck->lck_rw_want_write = FALSE;
    lck->lck_rw_want_upgrade = FALSE;
    lck->lck_rw_shared_count = 0;
    lck->lck_rw_can_sleep = TRUE;
    lck->lck_r_waiting = lck->lck_w_waiting = 0;
    lck->lck_rw_tag = 0;
    lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
                              LCK_ATTR_RW_SHARED_PRIORITY) == 0);

    lck_grp_reference(grp);
    lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
}
/*
 *      Routine:        lck_rw_destroy
 */
void
lck_rw_destroy(
    lck_rw_t    *lck,
    lck_grp_t   *grp)
{
    if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
        return;
#if MACH_LDEBUG
    lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
#endif
    lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
    lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
    lck_grp_deallocate(grp);
    return;
}
/*
 *    Sleep locks.  These use the same data structure and algorithm
 *    as the spin locks, but the process sleeps while it is waiting
 *    for the lock.  These work on uniprocessor systems.
 */

#define DECREMENTER_TIMEOUT 1000000

#define RW_LOCK_READER_EVENT(x) \
        ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))

#define RW_LOCK_WRITER_EVENT(x) \
        ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
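/*
 * Readers and writers block on two distinct wait events derived from the
 * same lock: the address of lck_rw_tag for readers and of lck_rw_pad8 for
 * writers.  Any two distinct addresses inside the lock would do; deriving
 * them via offsetof lets thread_wakeup(RW_LOCK_WRITER_EVENT(lck)) rouse
 * only waiting writers while readers keep sleeping, and vice versa.
 */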
/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */
static boolean_t
lck_interlock_lock(lck_rw_t *lck)
{
    boolean_t   istate;

    istate = ml_set_interrupts_enabled(FALSE);
    hw_lock_byte_lock(&lck->lck_rw_interlock);

    return istate;
}

static void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
    hw_lock_byte_unlock(&lck->lck_rw_interlock);
    ml_set_interrupts_enabled(istate);
}
/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending tlb flushes.
 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
 */
static inline void
lck_rw_lock_pause(boolean_t interrupts_enabled)
{
    if (!interrupts_enabled)
        handle_pending_TLB_flushes();
    cpu_pause();
}
/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(lck_rw_t *lck)
{
    if (lck->lck_rw_can_sleep) {
        if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
            /*
             * there are already threads waiting on this lock... this
             * implies that they have spun beyond their deadlines waiting for
             * the desired state to show up so we will not bother spinning at this time...
             *   or
             * the current number of threads sharing this lock exceeds our capacity to run them
             * concurrently and since all states we're going to spin for require the rw_shared_count
             * to be at 0, we'll not bother spinning since the latency for this to happen is
             * unpredictable...
             */
            return (mach_absolute_time());
        }
        return (mach_absolute_time() + MutexSpin);
    } else
        return (mach_absolute_time() + (100000LL * 1000000000LL));
}
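/*
 * The net effect is a simple adaptive policy, roughly:
 *
 *    deadline = now                  if waiters already exist or the
 *                                    reader count exceeds max_cpus
 *                                    (spinning is pointless, block)
 *    deadline = now + MutexSpin      for the common sleepable case
 *    deadline = now + a huge value   for locks that can never sleep,
 *                                    i.e. effectively spin forever
 *
 * Callers spin until either the desired state appears or this deadline
 * passes, and only then consider an assert_wait()/thread_block() pair.
 */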
/*
 *      Routine:        lck_rw_lock_exclusive
 */
void
lck_rw_lock_exclusive_gen(
    lck_rw_t    *lck)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
    uint64_t    deadline = 0;
    int         slept = 0;
    int         gotlock = 0;
    int         lockheld = 0;
    wait_result_t   res = 0;
    boolean_t   istate = -1;

#if CONFIG_DTRACE
    boolean_t dtrace_ls_initialized = FALSE;
    boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
    uint64_t wait_interval = 0;
    int readers_at_sleep = 0;
#endif

    /*
     *    Try to acquire the lck_rw_want_write bit.
     */
    while ( !lck_rw_grab_want(lck)) {

#if CONFIG_DTRACE
        if (dtrace_ls_initialized == FALSE) {
            dtrace_ls_initialized = TRUE;
            dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
            dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
            dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
            if (dtrace_ls_enabled) {
                /*
                 * Either sleeping or spinning is happening,
                 * start a timing of our delay interval now.
                 */
                readers_at_sleep = lck->lck_rw_shared_count;
                wait_interval = mach_absolute_time();
            }
        }
#endif
        istate = ml_get_interrupts_enabled();

        deadline = lck_rw_deadline_for_spin(lck);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

        while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
            lck_rw_lock_pause(istate);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);

        if (gotlock)
            break;
        /*
         * if we get here, the deadline has expired w/o us
         * being able to grab the lock exclusively
         * check to see if we're allowed to do a thread_block
         */
        if (lck->lck_rw_can_sleep) {

            istate = lck_interlock_lock(lck);

            if (lck->lck_rw_want_write) {

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

                lck->lck_w_waiting = TRUE;

                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                lck_interlock_unlock(lck, istate);

                if (res == THREAD_WAITING) {
                    res = thread_block(THREAD_CONTINUE_NULL);
                    slept++;
                }
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
            } else {
                lck->lck_rw_want_write = TRUE;
                lck_interlock_unlock(lck, istate);
                break;
            }
        }
    }
    /*
     * Wait for readers (and upgrades) to finish...
     * the test for these conditions must be done simultaneously with
     * a check of the interlock not being held since
     * the rw_shared_count will drop to 0 first and then want_upgrade
     * will be set to 1 in the shared_to_exclusive scenario... those
     * adjustments are done behind the interlock and represent an
     * atomic change in state and must be considered as such
     * however, once we see the read count at 0, the want_upgrade not set
     * and the interlock not held, we are safe to proceed
     */
    while (lck_rw_held_read_or_upgrade(lck)) {

#if CONFIG_DTRACE
        /*
         * Either sleeping or spinning is happening, start
         * a timing of our delay interval now.  If we set it
         * to -1 we don't have accurate data so we cannot later
         * decide to record a dtrace spin or sleep event.
         */
        if (dtrace_ls_initialized == FALSE) {
            dtrace_ls_initialized = TRUE;
            dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
            dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
            dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
            if (dtrace_ls_enabled) {
                /*
                 * Either sleeping or spinning is happening,
                 * start a timing of our delay interval now.
                 */
                readers_at_sleep = lck->lck_rw_shared_count;
                wait_interval = mach_absolute_time();
            }
        }
#endif
        istate = ml_get_interrupts_enabled();

        deadline = lck_rw_deadline_for_spin(lck);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

        while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
            lck_rw_lock_pause(istate);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);

        if ( !lockheld)
            break;
        /*
         * if we get here, the deadline has expired w/o us
         * being able to grab the lock exclusively
         * check to see if we're allowed to do a thread_block
         */
        if (lck->lck_rw_can_sleep) {

            istate = lck_interlock_lock(lck);

            if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

                lck->lck_w_waiting = TRUE;

                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                lck_interlock_unlock(lck, istate);

                if (res == THREAD_WAITING) {
                    res = thread_block(THREAD_CONTINUE_NULL);
                    slept++;
                }
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
            } else {
                lck_interlock_unlock(lck, istate);
                /*
                 * must own the lock now, since we checked for
                 * readers or upgrade owner behind the interlock
                 * no need for a call to 'lck_rw_held_read_or_upgrade'
                 */
                break;
            }
        }
    }

#if CONFIG_DTRACE
    /*
     * Decide what latencies we suffered that are Dtrace events.
     * If we have set wait_interval, then we either spun or slept.
     * At least we get out from under the interlock before we record
     * which is the best we can do here to minimize the impact
     * of the tracing.
     * If we have set wait_interval to -1, then dtrace was not enabled when we
     * started sleeping/spinning so we don't record this event.
     */
    if (dtrace_ls_enabled == TRUE) {
        if (slept == 0) {
            LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
                mach_absolute_time() - wait_interval, 1);
        } else {
            /*
             * For the blocking case, we also record if when we blocked
             * it was held for read or write, and how many readers.
             * Notice that above we recorded this before we dropped
             * the interlock so the count is accurate.
             */
            LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
                mach_absolute_time() - wait_interval, 1,
                (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
        }
    }
    LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
#endif
}
/*
 *      Routine:        lck_rw_done_gen
 *
 *    called from the assembly language wrapper...
 *    prior_lock_state is the value in the 1st
 *    word of the lock at the time of a successful
 *    atomic compare and exchange with the new value...
 *    it represents the state of the lock before we
 *    decremented the rw_shared_count or cleared either
 *    rw_want_upgrade or rw_want_write and
 *    the lck_x_waiting bits...  since the wrapper
 *    routine has already changed the state atomically,
 *    we just need to decide if we should
 *    wake up anyone and what value to return... we do
 *    this by examining the state of the lock before
 *    we changed it
 */
lck_rw_type_t
lck_rw_done_gen(
    lck_rw_t    *lck,
    int         prior_lock_state)
{
    lck_rw_t    *fake_lck;
    lck_rw_type_t   lock_type;
    thread_t    thread;
    uint32_t    rwlock_count;

    /*
     * prior_lock state is a snapshot of the 1st word of the
     * lock in question... we'll fake up a pointer to it
     * and carefully not access anything beyond whats defined
     * in the first word of a lck_rw_t
     */
    fake_lck = (lck_rw_t *)&prior_lock_state;

    if (fake_lck->lck_rw_shared_count <= 1) {
        if (fake_lck->lck_w_waiting)
            thread_wakeup(RW_LOCK_WRITER_EVENT(lck));

        if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
            thread_wakeup(RW_LOCK_READER_EVENT(lck));
    }
    if (fake_lck->lck_rw_shared_count)
        lock_type = LCK_RW_TYPE_SHARED;
    else
        lock_type = LCK_RW_TYPE_EXCLUSIVE;

    /* Check if dropping the lock means that we need to unpromote */
    thread = current_thread();
    rwlock_count = thread->rwlock_count--;
#if MACH_LDEBUG
    if (rwlock_count == 0) {
        panic("rw lock count underflow for thread %p", thread);
    }
#endif
    if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
        /* sched_flags checked without lock, but will be rechecked while clearing */
        lck_rw_clear_promotion(thread);
    }

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
#endif

    return(lock_type);
}
/*
 *    Routine:    lck_rw_unlock
 */
void
lck_rw_unlock(
    lck_rw_t    *lck,
    lck_rw_type_t   lck_rw_type)
{
    if (lck_rw_type == LCK_RW_TYPE_SHARED)
        lck_rw_unlock_shared(lck);
    else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
        lck_rw_unlock_exclusive(lck);
    else
        panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
}

/*
 *    Routine:    lck_rw_unlock_shared
 */
void
lck_rw_unlock_shared(
    lck_rw_t    *lck)
{
    lck_rw_type_t   ret;

    ret = lck_rw_done(lck);

    if (ret != LCK_RW_TYPE_SHARED)
        panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
}

/*
 *    Routine:    lck_rw_unlock_exclusive
 */
void
lck_rw_unlock_exclusive(
    lck_rw_t    *lck)
{
    lck_rw_type_t   ret;

    ret = lck_rw_done(lck);

    if (ret != LCK_RW_TYPE_EXCLUSIVE)
        panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
}

/*
 *    Routine:    lck_rw_lock
 */
void
lck_rw_lock(
    lck_rw_t    *lck,
    lck_rw_type_t   lck_rw_type)
{
    if (lck_rw_type == LCK_RW_TYPE_SHARED)
        lck_rw_lock_shared(lck);
    else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
        lck_rw_lock_exclusive(lck);
    else
        panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
}
/*
 *    Routine:    lck_rw_lock_shared_gen
 *    Function:
 *        assembly fast path code has determined that this lock
 *        is held exclusively... this is where we spin/block
 *        until we can acquire the lock in the shared mode
 */
void
lck_rw_lock_shared_gen(
    lck_rw_t    *lck)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
    uint64_t    deadline = 0;
    int         gotlock = 0;
    int         slept = 0;
    wait_result_t   res = 0;
    boolean_t   istate = -1;

#if CONFIG_DTRACE
    uint64_t wait_interval = 0;
    int readers_at_sleep = 0;
    boolean_t dtrace_ls_initialized = FALSE;
    boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif

    while ( !lck_rw_grab_shared(lck)) {

#if CONFIG_DTRACE
        if (dtrace_ls_initialized == FALSE) {
            dtrace_ls_initialized = TRUE;
            dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
            dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
            dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
            if (dtrace_ls_enabled) {
                /*
                 * Either sleeping or spinning is happening,
                 * start a timing of our delay interval now.
                 */
                readers_at_sleep = lck->lck_rw_shared_count;
                wait_interval = mach_absolute_time();
            }
        }
#endif
        istate = ml_get_interrupts_enabled();

        deadline = lck_rw_deadline_for_spin(lck);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
                     trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

        while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
            lck_rw_lock_pause(istate);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
                     trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);

        if (gotlock)
            break;
        /*
         * if we get here, the deadline has expired w/o us
         * being able to grab the lock for read
         * check to see if we're allowed to do a thread_block
         */
        if (lck->lck_rw_can_sleep) {

            istate = lck_interlock_lock(lck);

            if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
                ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {

                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
                             trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

                lck->lck_r_waiting = TRUE;

                res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
                lck_interlock_unlock(lck, istate);

                if (res == THREAD_WAITING) {
                    res = thread_block(THREAD_CONTINUE_NULL);
                    slept++;
                }
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
                             trace_lck, res, slept, 0, 0);
            } else {
                lck->lck_rw_shared_count++;
                lck_interlock_unlock(lck, istate);
                break;
            }
        }
    }

#if CONFIG_DTRACE
    if (dtrace_ls_enabled == TRUE) {
        if (slept == 0) {
            LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
        } else {
            LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
                mach_absolute_time() - wait_interval, 0,
                (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
        }
    }
    LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif
}
/*
 *    Routine:    lck_rw_lock_shared_to_exclusive_failure
 *    Function:
 *        assembly fast path code has already dropped our read
 *        count and determined that someone else owns 'lck_rw_want_upgrade'
 *        if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
 *        all we need to do here is determine if a wakeup is needed
 */
boolean_t
lck_rw_lock_shared_to_exclusive_failure(
    lck_rw_t    *lck,
    int         prior_lock_state)
{
    lck_rw_t    *fake_lck;
    thread_t    thread = current_thread();
    uint32_t    rwlock_count;

    /* Check if dropping the lock means that we need to unpromote */
    rwlock_count = thread->rwlock_count--;
#if MACH_LDEBUG
    if (rwlock_count == 0) {
        panic("rw lock count underflow for thread %p", thread);
    }
#endif
    if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
        /* sched_flags checked without lock, but will be rechecked while clearing */
        lck_rw_clear_promotion(thread);
    }

    /*
     * prior_lock state is a snapshot of the 1st word of the
     * lock in question... we'll fake up a pointer to it
     * and carefully not access anything beyond whats defined
     * in the first word of a lck_rw_t
     */
    fake_lck = (lck_rw_t *)&prior_lock_state;

    if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
        /*
         *    Someone else has requested upgrade.
         *    Since we've released the read lock, wake
         *    him up if he's blocked waiting
         */
        thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
    }
    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
                 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);

    return (FALSE);
}
/*
 *    Routine:    lck_rw_lock_shared_to_exclusive_success
 *    Function:
 *        assembly fast path code has already dropped our read
 *        count and successfully acquired 'lck_rw_want_upgrade'
 *        we just need to wait for the rest of the readers to drain
 *        and then we can return as the exclusive holder of this lock
 */
boolean_t
lck_rw_lock_shared_to_exclusive_success(
    lck_rw_t    *lck)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
    uint64_t    deadline = 0;
    int         slept = 0;
    int         still_shared = 0;
    wait_result_t   res;
    boolean_t   istate = -1;

#if CONFIG_DTRACE
    uint64_t wait_interval = 0;
    int readers_at_sleep = 0;
    boolean_t dtrace_ls_initialized = FALSE;
    boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

    while (lck->lck_rw_shared_count != 0) {

#if CONFIG_DTRACE
        if (dtrace_ls_initialized == FALSE) {
            dtrace_ls_initialized = TRUE;
            dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
            dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
            dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
            if (dtrace_ls_enabled) {
                /*
                 * Either sleeping or spinning is happening,
                 * start a timing of our delay interval now.
                 */
                readers_at_sleep = lck->lck_rw_shared_count;
                wait_interval = mach_absolute_time();
            }
        }
#endif
        istate = ml_get_interrupts_enabled();

        deadline = lck_rw_deadline_for_spin(lck);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
                     trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

        while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
            lck_rw_lock_pause(istate);

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
                     trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

        if ( !still_shared)
            break;
        /*
         * if we get here, the deadline has expired w/o
         * the rw_shared_count having drained to 0
         * check to see if we're allowed to do a thread_block
         */
        if (lck->lck_rw_can_sleep) {

            istate = lck_interlock_lock(lck);

            if (lck->lck_rw_shared_count != 0) {
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
                             trace_lck, lck->lck_rw_shared_count, 0, 0, 0);

                lck->lck_w_waiting = TRUE;

                res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
                lck_interlock_unlock(lck, istate);

                if (res == THREAD_WAITING) {
                    res = thread_block(THREAD_CONTINUE_NULL);
                    slept++;
                }
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
                             trace_lck, res, slept, 0, 0);
            } else {
                lck_interlock_unlock(lck, istate);
                break;
            }
        }
    }
#if CONFIG_DTRACE
    /*
     * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
     */
    if (dtrace_ls_enabled == TRUE) {
        if (slept == 0) {
            LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
        } else {
            LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
                mach_absolute_time() - wait_interval, 1,
                (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
        }
    }
    LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
#endif
    return (TRUE);
}
/*
 *      Routine:        lck_rw_lock_exclusive_to_shared_gen
 *    Function:
 *        assembly fast path has already dropped
 *        our exclusive state and bumped lck_rw_shared_count
 *        all we need to do here is determine if anyone
 *        needs to be awakened.
 */
void
lck_rw_lock_exclusive_to_shared_gen(
    lck_rw_t    *lck,
    int         prior_lock_state)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
    lck_rw_t    *fake_lck;

    /*
     * prior_lock state is a snapshot of the 1st word of the
     * lock in question... we'll fake up a pointer to it
     * and carefully not access anything beyond whats defined
     * in the first word of a lck_rw_t
     */
    fake_lck = (lck_rw_t *)&prior_lock_state;

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
                 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);

    /*
     * don't wake up anyone waiting to take the lock exclusively
     * since we hold a read count... when the read count drops to 0,
     * the writers will be woken.
     *
     * wake up any waiting readers if we don't have any writers waiting,
     * or the lock is NOT marked as rw_priv_excl (writers have privilege)
     */
    if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
        thread_wakeup(RW_LOCK_READER_EVENT(lck));

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
                 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
#endif
}
/*
 *      Routine:        lck_rw_try_lock
 */
boolean_t
lck_rw_try_lock(
    lck_rw_t    *lck,
    lck_rw_type_t   lck_rw_type)
{
    if (lck_rw_type == LCK_RW_TYPE_SHARED)
        return(lck_rw_try_lock_shared(lck));
    else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
        return(lck_rw_try_lock_exclusive(lck));
    else
        panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
    return(FALSE);
}
void
lck_rw_assert(
    lck_rw_t    *lck,
    unsigned int    type)
{
    switch (type) {
    case LCK_RW_ASSERT_SHARED:
        if (lck->lck_rw_shared_count != 0) {
            return;
        }
        break;
    case LCK_RW_ASSERT_EXCLUSIVE:
        if ((lck->lck_rw_want_write ||
             lck->lck_rw_want_upgrade) &&
            lck->lck_rw_shared_count == 0) {
            return;
        }
        break;
    case LCK_RW_ASSERT_HELD:
        if (lck->lck_rw_want_write ||
            lck->lck_rw_want_upgrade ||
            lck->lck_rw_shared_count != 0) {
            return;
        }
        break;
    case LCK_RW_ASSERT_NOTHELD:
        if (!(lck->lck_rw_want_write ||
              lck->lck_rw_want_upgrade ||
              lck->lck_rw_shared_count != 0)) {
            return;
        }
        break;
    default:
        break;
    }

    panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
}
/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
void
lck_rw_clear_promotions_x86(thread_t thread)
{
#if MACH_LDEBUG
    /* It's fatal to leave a RW lock locked and return to userspace */
    panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
#else
    /* Paper over the issue */
    thread->rwlock_count = 0;
    lck_rw_clear_promotion(thread);
#endif
}
/*
 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 */
boolean_t
kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
    if (not_in_kdp) {
        panic("panic: rw lock exclusive check done outside of kernel debugger");
    }
    return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
}
#ifdef MUTEX_ZONE
extern zone_t lck_mtx_zone;
#endif

/*
 *      Routine:        lck_mtx_alloc_init
 */
lck_mtx_t *
lck_mtx_alloc_init(
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_mtx_t   *lck;
#ifdef MUTEX_ZONE
    if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
        lck_mtx_init(lck, grp, attr);
#else
    if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
        lck_mtx_init(lck, grp, attr);
#endif
    return(lck);
}

/*
 *      Routine:        lck_mtx_free
 */
void
lck_mtx_free(
    lck_mtx_t   *lck,
    lck_grp_t   *grp)
{
    lck_mtx_destroy(lck, grp);
#ifdef MUTEX_ZONE
    zfree(lck_mtx_zone, lck);
#else
    kfree(lck, sizeof(lck_mtx_t));
#endif
}
/*
 *      Routine:        lck_mtx_ext_init
 */
void
lck_mtx_ext_init(
    lck_mtx_ext_t   *lck,
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    bzero((void *)lck, sizeof(lck_mtx_ext_t));

    if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
        lck->lck_mtx_deb.type = MUTEX_TAG;
        lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
    }

    lck->lck_mtx_grp = grp;

    if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
        lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;

    lck->lck_mtx.lck_mtx_is_ext = 1;
    lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
}
/*
 *      Routine:        lck_mtx_init
 */
void
lck_mtx_init(
    lck_mtx_t   *lck,
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_mtx_ext_t   *lck_ext;
    lck_attr_t  *lck_attr;

    if (attr != LCK_ATTR_NULL)
        lck_attr = attr;
    else
        lck_attr = &LockDefaultLckAttr;

    if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
        if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
            lck_mtx_ext_init(lck_ext, grp, lck_attr);
            lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
            lck->lck_mtx_ptr = lck_ext;
        }
    } else {
        lck->lck_mtx_owner = 0;
        lck->lck_mtx_state = 0;
    }
    lck->lck_mtx_pad32 = 0xFFFFFFFF;
    lck_grp_reference(grp);
    lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}
/*
 *      Routine:        lck_mtx_init_ext
 */
void
lck_mtx_init_ext(
    lck_mtx_t   *lck,
    lck_mtx_ext_t   *lck_ext,
    lck_grp_t   *grp,
    lck_attr_t  *attr)
{
    lck_attr_t  *lck_attr;

    if (attr != LCK_ATTR_NULL)
        lck_attr = attr;
    else
        lck_attr = &LockDefaultLckAttr;

    if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
        lck_mtx_ext_init(lck_ext, grp, lck_attr);
        lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
        lck->lck_mtx_ptr = lck_ext;
    } else {
        lck->lck_mtx_owner = 0;
        lck->lck_mtx_state = 0;
    }
    lck->lck_mtx_pad32 = 0xFFFFFFFF;

    lck_grp_reference(grp);
    lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}
/*
 *      Routine:        lck_mtx_destroy
 */
void
lck_mtx_destroy(
    lck_mtx_t   *lck,
    lck_grp_t   *grp)
{
    boolean_t lck_is_indirect;

    if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
        return;
#if MACH_LDEBUG
    lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
#endif
    lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);

    lck_mtx_lock_mark_destroyed(lck);

    if (lck_is_indirect)
        kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
    lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
    lck_grp_deallocate(grp);
    return;
}
#define LCK_MTX_LCK_WAIT_CODE       0x20
#define LCK_MTX_LCK_WAKEUP_CODE     0x21
#define LCK_MTX_LCK_SPIN_CODE       0x22
#define LCK_MTX_LCK_ACQUIRE_CODE    0x23
#define LCK_MTX_LCK_DEMOTE_CODE     0x24
/*
 * Routine:     lck_mtx_unlock_wakeup_x86
 *
 * Invoked on unlock when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0 or
 * mutex->lck_mtx_promoted != 0)...
 *
 * neither the mutex nor the interlock is held
 */
void
lck_mtx_unlock_wakeup_x86 (
    lck_mtx_t   *mutex,
    int         prior_lock_state)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    lck_mtx_t   fake_lck;

    /*
     * prior_lock state is a snapshot of the 2nd word of the
     * lock in question... we'll fake up a lock with the bits
     * copied into place and carefully not access anything
     * beyond whats defined in the second word of a lck_mtx_t
     */
    fake_lck.lck_mtx_state = prior_lock_state;

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
                 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);

    if (__probable(fake_lck.lck_mtx_waiters)) {
        if (fake_lck.lck_mtx_waiters > 1)
            thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
        else
            thread_wakeup_one(LCK_MTX_EVENT(mutex));
    }

    if (__improbable(fake_lck.lck_mtx_promoted)) {
        thread_t    thread = current_thread();

        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
                     thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);

        if (thread->promotions > 0) {
            spl_t   s = splsched();

            thread_lock(thread);

            if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {

                thread->sched_flags &= ~TH_SFLAG_PROMOTED;

                if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
                    /* Thread still has a RW lock promotion */
                } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
                    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
                                          thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);

                    set_sched_pri(thread, DEPRESSPRI);
                } else {
                    if (thread->base_pri < thread->sched_pri) {
                        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
                                              thread->sched_pri, thread->base_pri, 0, trace_lck, 0);

                        thread_recompute_sched_pri(thread, FALSE);
                    }
                }
            }
            thread_unlock(thread);
            splx(s);
        }
    }
    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
                 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
}
/*
 * Routine:     lck_mtx_lock_acquire_x86
 *
 * Invoked on acquiring the mutex when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0 or
 * thread->was_promoted_on_wakeup != 0)...
 *
 * mutex is owned...  interlock is held... preemption is disabled
 */
void
lck_mtx_lock_acquire_x86(
    lck_mtx_t   *mutex)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    thread_t    thread;
    integer_t   priority;
    spl_t       s;

    thread = (thread_t)mutex->lck_mtx_owner;    /* faster than current_thread() */

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
                 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

    if (mutex->lck_mtx_waiters)
        priority = mutex->lck_mtx_pri;
    else
        priority = 0;

    if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {

        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
                              thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);

        s = splsched();
        thread_lock(thread);

        if (thread->sched_pri < priority) {
            /* Do not promote past promotion ceiling */
            assert(priority <= MAXPRI_PROMOTE);
            set_sched_pri(thread, priority);
        }
        if (mutex->lck_mtx_promoted == 0) {
            mutex->lck_mtx_promoted = 1;

            thread->promotions++;
            thread->sched_flags |= TH_SFLAG_PROMOTED;
        }
        thread->was_promoted_on_wakeup = 0;

        thread_unlock(thread);
        splx(s);
    }
    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
                 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
}
static boolean_t
lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
{
    boolean_t   retval;

    *istate = ml_set_interrupts_enabled(FALSE);
    retval = lck_mtx_ilk_try_lock(mutex);

    if (retval == 0)
        ml_set_interrupts_enabled(*istate);

    return retval;
}

static void
lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
{
    lck_mtx_ilk_unlock(mutex);
    ml_set_interrupts_enabled(istate);
}
/*
 * Routine:     lck_mtx_lock_spinwait_x86
 *
 * Invoked trying to acquire a mutex when there is contention but
 * the holder is running on another processor. We spin for up to a maximum
 * time waiting for the lock to be released.
 *
 * Called with the interlock unlocked.
 * returns 0 if mutex acquired
 * returns 1 if we spun
 * returns 2 if we didn't spin due to the holder not running
 */
int
lck_mtx_lock_spinwait_x86(
    lck_mtx_t   *mutex)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    thread_t    holder;
    uint64_t    overall_deadline;
    uint64_t    check_owner_deadline;
    uint64_t    cur_time;
    int         retval = 1;
    int         loopcount = 0;

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
                 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);

    cur_time = mach_absolute_time();
    overall_deadline = cur_time + MutexSpin;
    check_owner_deadline = cur_time;

    /*
     * Spin while:
     *   - mutex is locked, and
     *   - its locked as a spin lock, and
     *   - owner is running on another processor, and
     *   - owner (processor) is not idling, and
     *   - we haven't spun for long enough.
     */
    do {
        if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
            retval = 0;
            break;
        }
        cur_time = mach_absolute_time();

        if (cur_time >= overall_deadline)
            break;

        if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
            boolean_t   istate;

            if (lck_mtx_interlock_try_lock(mutex, &istate)) {

                if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {

                    if ( !(holder->machine.specFlags & OnProc) ||
                         (holder->state & TH_IDLE)) {

                        lck_mtx_interlock_unlock(mutex, istate);

                        if (loopcount == 0)
                            retval = 2;
                        break;
                    }
                }
                lck_mtx_interlock_unlock(mutex, istate);

                check_owner_deadline = cur_time + (MutexSpin / 4);
            }
        }
        cpu_pause();

        loopcount++;

    } while (TRUE);

#if CONFIG_DTRACE
    /*
     * We've already kept a count via overall_deadline of how long we spun.
     * If dtrace is active, then we compute backwards to decide how
     * long we spun.
     *
     * Note that we record a different probe id depending on whether
     * this is a direct or indirect mutex.  This allows us to
     * penalize only lock groups that have debug/stats enabled
     * with dtrace processing if desired.
     */
    if (__probable(mutex->lck_mtx_is_ext == 0)) {
        LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
            mach_absolute_time() - (overall_deadline - MutexSpin));
    } else {
        LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
            mach_absolute_time() - (overall_deadline - MutexSpin));
    }
    /* The lockstat acquire event is recorded by the assembly code beneath us. */
#endif

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
                 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);

    return retval;
}
/*
 * Routine:     lck_mtx_lock_wait_x86
 *
 * Invoked in order to wait on contention.
 *
 * Called with the interlock locked and
 * preemption disabled...
 * returns it unlocked and with preemption enabled
 */
void
lck_mtx_lock_wait_x86 (
    lck_mtx_t   *mutex)
{
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    thread_t    self = current_thread();
    thread_t    holder;
    integer_t   priority;
    spl_t       s;
#if CONFIG_DTRACE
    uint64_t    sleep_start = 0;

    if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
        sleep_start = mach_absolute_time();
    }
#endif
    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
                 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

    priority = self->sched_pri;

    if (priority < self->base_pri)
        priority = self->base_pri;
    if (priority < BASEPRI_DEFAULT)
        priority = BASEPRI_DEFAULT;

    /* Do not promote past promotion ceiling */
    priority = MIN(priority, MAXPRI_PROMOTE);

    if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
        mutex->lck_mtx_pri = priority;
    mutex->lck_mtx_waiters++;

    if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
         holder->sched_pri < mutex->lck_mtx_pri ) {
        s = splsched();
        thread_lock(holder);

        /* holder priority may have been bumped by another thread
         * before thread_lock was taken
         */
        if (holder->sched_pri < mutex->lck_mtx_pri) {
            KERNEL_DEBUG_CONSTANT(
                MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
                holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
            /* Assert that we're not altering the priority of a
             * thread above the MAXPRI_PROMOTE band
             */
            assert(holder->sched_pri < MAXPRI_PROMOTE);
            set_sched_pri(holder, priority);

            if (mutex->lck_mtx_promoted == 0) {
                holder->promotions++;
                holder->sched_flags |= TH_SFLAG_PROMOTED;

                mutex->lck_mtx_promoted = 1;
            }
        }
        thread_unlock(holder);
        splx(s);
    }
    assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);

    lck_mtx_ilk_unlock(mutex);

    thread_block(THREAD_CONTINUE_NULL);

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
                 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

#if CONFIG_DTRACE
    /*
     * Record the Dtrace lockstat probe for blocking, block time
     * measured from when we were entered.
     */
    if (sleep_start) {
        if (mutex->lck_mtx_is_ext == 0) {
            LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
                mach_absolute_time() - sleep_start);
        } else {
            LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
                mach_absolute_time() - sleep_start);
        }
    }
#endif
}
/*
 * Routine: kdp_lck_mtx_lock_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 * Returns: TRUE if lock is acquired.
 */
boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
{
    if (not_in_kdp) {
        panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
    }

    if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
        return TRUE;
    }

    return FALSE;
}