[apple/xnu.git] / osfmk / i386 / locks_i386.c (xnu-2782.20.48)
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #include <mach_ldebug.h>
65
66 #include <kern/locks.h>
67 #include <kern/kalloc.h>
68 #include <kern/misc_protos.h>
69 #include <kern/thread.h>
70 #include <kern/processor.h>
71 #include <kern/cpu_data.h>
72 #include <kern/cpu_number.h>
73 #include <kern/sched_prim.h>
74 #include <kern/xpr.h>
75 #include <kern/debug.h>
76 #include <string.h>
77
78 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
79 #include <machine/machine_cpu.h>
80 #include <i386/mp.h>
81
82 #include <sys/kdebug.h>
83 #include <mach/branch_predicates.h>
84
85 /*
86 * We need only enough declarations from the BSD-side to be able to
87 * test if our probe is active, and to call __dtrace_probe(). Setting
88 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
89 */
90 #if CONFIG_DTRACE
91 #define NEED_DTRACE_DEFS
92 #include <../bsd/sys/lockstat.h>
93 #endif
94
95 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
96 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
97 #define LCK_RW_LCK_SHARED_CODE 0x102
98 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
99 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
100 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
101
102 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
103 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
104 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
105 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
106 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
107 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
108 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
109 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
110
111
112 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
113
114 unsigned int LcksOpts=0;
115
116 /* Forwards */
117
118 #if USLOCK_DEBUG
119 /*
120 * Perform simple lock checks.
121 */
122 int uslock_check = 1;
123 int max_lock_loops = 100000000;
124 decl_simple_lock_data(extern , printf_lock)
125 decl_simple_lock_data(extern , panic_lock)
126 #endif /* USLOCK_DEBUG */
127
128 extern unsigned int not_in_kdp;
129
130 /*
131 * We often want to know the addresses of the callers
132 * of the various lock routines. However, this information
133 * is only used for debugging and statistics.
134 */
135 typedef void *pc_t;
136 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
137 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
138 #if ANY_LOCK_DEBUG
139 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
140 #define DECL_PC(pc) pc_t pc;
141 #else /* ANY_LOCK_DEBUG */
142 #define DECL_PC(pc)
143 #ifdef lint
144 /*
145 * Eliminate lint complaints about unused local pc variables.
146 */
147 #define OBTAIN_PC(pc) ++pc
148 #else /* lint */
149 #define OBTAIN_PC(pc)
150 #endif /* lint */
151 #endif /* ANY_LOCK_DEBUG */
152
153
154 /*
155 * Portable lock package implementation of usimple_locks.
156 */
157
158 #if USLOCK_DEBUG
159 #define USLDBG(stmt) stmt
160 void usld_lock_init(usimple_lock_t, unsigned short);
161 void usld_lock_pre(usimple_lock_t, pc_t);
162 void usld_lock_post(usimple_lock_t, pc_t);
163 void usld_unlock(usimple_lock_t, pc_t);
164 void usld_lock_try_pre(usimple_lock_t, pc_t);
165 void usld_lock_try_post(usimple_lock_t, pc_t);
166 int usld_lock_common_checks(usimple_lock_t, char *);
167 #else /* USLOCK_DEBUG */
168 #define USLDBG(stmt)
169 #endif /* USLOCK_DEBUG */
170
171
172 extern int lck_rw_grab_want(lck_rw_t *lck);
173 extern int lck_rw_grab_shared(lck_rw_t *lck);
174 extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
175
176
177 /*
178 * Forward definitions
179 */
180
181 void lck_rw_lock_shared_gen(
182 lck_rw_t *lck);
183
184 void lck_rw_lock_exclusive_gen(
185 lck_rw_t *lck);
186
187 boolean_t lck_rw_lock_shared_to_exclusive_success(
188 lck_rw_t *lck);
189
190 boolean_t lck_rw_lock_shared_to_exclusive_failure(
191 lck_rw_t *lck,
192 int prior_lock_state);
193
194 void lck_rw_lock_exclusive_to_shared_gen(
195 lck_rw_t *lck,
196 int prior_lock_state);
197
198 lck_rw_type_t lck_rw_done_gen(
199 lck_rw_t *lck,
200 int prior_lock_state);
201
202 void lck_rw_clear_promotions_x86(thread_t thread);
203
204 /*
205 * Routine: lck_spin_alloc_init
206 */
207 lck_spin_t *
208 lck_spin_alloc_init(
209 lck_grp_t *grp,
210 lck_attr_t *attr)
211 {
212 lck_spin_t *lck;
213
214 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
215 lck_spin_init(lck, grp, attr);
216
217 return(lck);
218 }
219
220 /*
221 * Routine: lck_spin_free
222 */
223 void
224 lck_spin_free(
225 lck_spin_t *lck,
226 lck_grp_t *grp)
227 {
228 lck_spin_destroy(lck, grp);
229 kfree(lck, sizeof(lck_spin_t));
230 }
231
232 /*
233 * Routine: lck_spin_init
234 */
235 void
236 lck_spin_init(
237 lck_spin_t *lck,
238 lck_grp_t *grp,
239 __unused lck_attr_t *attr)
240 {
241 usimple_lock_init((usimple_lock_t) lck, 0);
242 lck_grp_reference(grp);
243 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
244 }
245
246 /*
247 * Routine: lck_spin_destroy
248 */
249 void
250 lck_spin_destroy(
251 lck_spin_t *lck,
252 lck_grp_t *grp)
253 {
254 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
255 return;
256 lck->interlock = LCK_SPIN_TAG_DESTROYED;
257 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
258 lck_grp_deallocate(grp);
259 return;
260 }
261
262 /*
263 * Routine: lck_spin_lock
264 */
265 void
266 lck_spin_lock(
267 lck_spin_t *lck)
268 {
269 usimple_lock((usimple_lock_t) lck);
270 }
271
272 /*
273 * Routine: lck_spin_unlock
274 */
275 void
276 lck_spin_unlock(
277 lck_spin_t *lck)
278 {
279 usimple_unlock((usimple_lock_t) lck);
280 }
281
282
283 /*
284 * Routine: lck_spin_try_lock
285 */
286 boolean_t
287 lck_spin_try_lock(
288 lck_spin_t *lck)
289 {
290 return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
291 }
292
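/*
 * Illustrative usage sketch for the spin lock KPI above; this is not part
 * of the file's implementation and the group/lock names are hypothetical:
 *
 *	lck_grp_t  *grp = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
 *	lck_spin_t *sl  = lck_spin_alloc_init(grp, LCK_ATTR_NULL);
 *
 *	lck_spin_lock(sl);
 *	... short, non-blocking critical section (preemption is disabled) ...
 *	lck_spin_unlock(sl);
 *
 *	if (lck_spin_try_lock(sl)) {
 *		... acquired without spinning ...
 *		lck_spin_unlock(sl);
 *	}
 *
 *	lck_spin_free(sl, grp);
 *	lck_grp_free(grp);
 */
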
293 /*
294 * Routine: lck_spin_is_acquired
295 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
296 * Returns: TRUE if lock is acquired.
297 */
298 boolean_t
299 lck_spin_is_acquired(lck_spin_t *lck) {
300 if (not_in_kdp) {
301 panic("panic: spinlock acquired check done outside of kernel debugger");
302 }
303 return (lck->interlock != 0)? TRUE : FALSE;
304 }
305
306 /*
307 * Initialize a usimple_lock.
308 *
309 * No change in preemption state.
310 */
311 void
312 usimple_lock_init(
313 usimple_lock_t l,
314 __unused unsigned short tag)
315 {
316 #ifndef MACHINE_SIMPLE_LOCK
317 USLDBG(usld_lock_init(l, tag));
318 hw_lock_init(&l->interlock);
319 #else
320 simple_lock_init((simple_lock_t)l,tag);
321 #endif
322 }
323
324 volatile uint32_t spinlock_owner_cpu = ~0;
325 volatile usimple_lock_t spinlock_timed_out;
326
327 uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
328 uint64_t deadline;
329 uint32_t i;
330
331 for (i = 0; i < real_ncpus; i++) {
332 if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
333 spinlock_owner_cpu = i;
334 if ((uint32_t) cpu_number() == i)
335 break;
336 cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
337 cpu_NMI_interrupt(i);
338 deadline = mach_absolute_time() + (LockTimeOut * 2);
339 while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
340 cpu_pause();
341 break;
342 }
343 }
344
345 return spinlock_owner_cpu;
346 }
347
348 /*
349 * Acquire a usimple_lock.
350 *
351 * Returns with preemption disabled. Note
352 * that the hw_lock routines are responsible for
353 * maintaining preemption state.
354 */
355 void
356 usimple_lock(
357 usimple_lock_t l)
358 {
359 #ifndef MACHINE_SIMPLE_LOCK
360 DECL_PC(pc);
361
362 OBTAIN_PC(pc);
363 USLDBG(usld_lock_pre(l, pc));
364
365 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
366 boolean_t uslock_acquired = FALSE;
367 while (machine_timeout_suspended()) {
368 enable_preemption();
369 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
370 break;
371 }
372
373 if (uslock_acquired == FALSE) {
374 uint32_t lock_cpu;
375 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
376 spinlock_timed_out = l;
377 lock_cpu = spinlock_timeout_NMI(lowner);
378 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
379 }
380 }
381 USLDBG(usld_lock_post(l, pc));
382 #else
383 simple_lock((simple_lock_t)l);
384 #endif
385 }
386
387
388 /*
389 * Release a usimple_lock.
390 *
391 * Returns with preemption enabled. Note
392 * that the hw_lock routines are responsible for
393 * maintaining preemption state.
394 */
395 void
396 usimple_unlock(
397 usimple_lock_t l)
398 {
399 #ifndef MACHINE_SIMPLE_LOCK
400 DECL_PC(pc);
401
402 OBTAIN_PC(pc);
403 USLDBG(usld_unlock(l, pc));
404 hw_lock_unlock(&l->interlock);
405 #else
406 simple_unlock_rwmb((simple_lock_t)l);
407 #endif
408 }
409
410
411 /*
412 * Conditionally acquire a usimple_lock.
413 *
414 * On success, returns with preemption disabled.
415 * On failure, returns with preemption in the same state
416 * as when first invoked. Note that the hw_lock routines
417 * are responsible for maintaining preemption state.
418 *
419 * XXX No stats are gathered on a miss; I preserved this
420 * behavior from the original assembly-language code, but
421 * doesn't it make sense to log misses? XXX
422 */
423 unsigned int
424 usimple_lock_try(
425 usimple_lock_t l)
426 {
427 #ifndef MACHINE_SIMPLE_LOCK
428 unsigned int success;
429 DECL_PC(pc);
430
431 OBTAIN_PC(pc);
432 USLDBG(usld_lock_try_pre(l, pc));
433 if ((success = hw_lock_try(&l->interlock))) {
434 USLDBG(usld_lock_try_post(l, pc));
435 }
436 return success;
437 #else
438 return(simple_lock_try((simple_lock_t)l));
439 #endif
440 }
441
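/*
 * Illustrative sketch of the portable simple lock interface that wraps the
 * usimple_lock routines above (assumes the standard simple_lock macros;
 * the lock name is hypothetical):
 *
 *	decl_simple_lock_data(static, example_slock)
 *
 *	simple_lock_init(&example_slock, 0);
 *	simple_lock(&example_slock);		(returns with preemption disabled)
 *	... critical section ...
 *	simple_unlock(&example_slock);		(preemption re-enabled)
 */
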
442 #if USLOCK_DEBUG
443 /*
444 * States of a usimple_lock. The default when initializing
445 * a usimple_lock is setting it up for debug checking.
446 */
447 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
448 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
449 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
450 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
451 #define USLOCK_CHECKING(l) (uslock_check && \
452 ((l)->debug.state & USLOCK_CHECKED))
453
454 /*
455 * Trace activities of a particularly interesting lock.
456 */
457 void usl_trace(usimple_lock_t, int, pc_t, const char *);
458
459
460 /*
461 * Initialize the debugging information contained
462 * in a usimple_lock.
463 */
464 void
465 usld_lock_init(
466 usimple_lock_t l,
467 __unused unsigned short tag)
468 {
469 if (l == USIMPLE_LOCK_NULL)
470 panic("lock initialization: null lock pointer");
471 l->lock_type = USLOCK_TAG;
472 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
473 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
474 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
475 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
476 l->debug.duration[0] = l->debug.duration[1] = 0;
477 l->debug.unlock_cpu = 0;
478 l->debug.unlock_pc = INVALID_PC;
479 l->debug.unlock_thread = INVALID_THREAD;
480 }
481
482
483 /*
484 * These checks apply to all usimple_locks, not just
485 * those with USLOCK_CHECKED turned on.
486 */
487 int
488 usld_lock_common_checks(
489 usimple_lock_t l,
490 char *caller)
491 {
492 if (l == USIMPLE_LOCK_NULL)
493 panic("%s: null lock pointer", caller);
494 if (l->lock_type != USLOCK_TAG)
495 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
496 if (!(l->debug.state & USLOCK_INIT))
497 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
498 return USLOCK_CHECKING(l);
499 }
500
501
502 /*
503 * Debug checks on a usimple_lock just before attempting
504 * to acquire it.
505 */
506 /* ARGSUSED */
507 void
508 usld_lock_pre(
509 usimple_lock_t l,
510 pc_t pc)
511 {
512 char caller[] = "usimple_lock";
513
514
515 if (!usld_lock_common_checks(l, caller))
516 return;
517
518 /*
519 * Note that we have a weird case where we are getting a lock when we are
520 * in the process of putting the system to sleep. We are running with no
521 * current threads, therefore we can't tell if we are trying to retake a lock
522 * we have or someone on the other processor has it. Therefore we just
523 * ignore this test if the locking thread is 0.
524 */
525
526 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
527 l->debug.lock_thread == (void *) current_thread()) {
528 printf("%s: lock %p already locked (at %p) by",
529 caller, l, l->debug.lock_pc);
530 printf(" current thread %p (new attempt at pc %p)\n",
531 l->debug.lock_thread, pc);
532 panic("%s", caller);
533 }
534 mp_disable_preemption();
535 usl_trace(l, cpu_number(), pc, caller);
536 mp_enable_preemption();
537 }
538
539
540 /*
541 * Debug checks on a usimple_lock just after acquiring it.
542 *
543 * Pre-emption has been disabled at this point,
544 * so we are safe in using cpu_number.
545 */
546 void
547 usld_lock_post(
548 usimple_lock_t l,
549 pc_t pc)
550 {
551 register int mycpu;
552 char caller[] = "successful usimple_lock";
553
554
555 if (!usld_lock_common_checks(l, caller))
556 return;
557
558 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
559 panic("%s: lock %p became uninitialized",
560 caller, l);
561 if ((l->debug.state & USLOCK_TAKEN))
562 panic("%s: lock 0x%p became TAKEN by someone else",
563 caller, l);
564
565 mycpu = cpu_number();
566 l->debug.lock_thread = (void *)current_thread();
567 l->debug.state |= USLOCK_TAKEN;
568 l->debug.lock_pc = pc;
569 l->debug.lock_cpu = mycpu;
570
571 usl_trace(l, mycpu, pc, caller);
572 }
573
574
575 /*
576 * Debug checks on a usimple_lock just before
577 * releasing it. Note that the caller has not
578 * yet released the hardware lock.
579 *
580 * Preemption is still disabled, so there's
581 * no problem using cpu_number.
582 */
583 void
584 usld_unlock(
585 usimple_lock_t l,
586 pc_t pc)
587 {
588 register int mycpu;
589 char caller[] = "usimple_unlock";
590
591
592 if (!usld_lock_common_checks(l, caller))
593 return;
594
595 mycpu = cpu_number();
596
597 if (!(l->debug.state & USLOCK_TAKEN))
598 panic("%s: lock 0x%p hasn't been taken",
599 caller, l);
600 if (l->debug.lock_thread != (void *) current_thread())
601 panic("%s: unlocking lock 0x%p, owned by thread %p",
602 caller, l, l->debug.lock_thread);
603 if (l->debug.lock_cpu != mycpu) {
604 printf("%s: unlocking lock 0x%p on cpu 0x%x",
605 caller, l, mycpu);
606 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
607 panic("%s", caller);
608 }
609 usl_trace(l, mycpu, pc, caller);
610
611 l->debug.unlock_thread = l->debug.lock_thread;
612 l->debug.lock_thread = INVALID_THREAD;
613 l->debug.state &= ~USLOCK_TAKEN;
614 l->debug.unlock_pc = pc;
615 l->debug.unlock_cpu = mycpu;
616 }
617
618
619 /*
620 * Debug checks on a usimple_lock just before
621 * attempting to acquire it.
622 *
623 * Preemption isn't guaranteed to be disabled.
624 */
625 void
626 usld_lock_try_pre(
627 usimple_lock_t l,
628 pc_t pc)
629 {
630 char caller[] = "usimple_lock_try";
631
632 if (!usld_lock_common_checks(l, caller))
633 return;
634 mp_disable_preemption();
635 usl_trace(l, cpu_number(), pc, caller);
636 mp_enable_preemption();
637 }
638
639
640 /*
641 * Debug checks on a usimple_lock just after
642 * successfully attempting to acquire it.
643 *
644 * Preemption has been disabled by the
645 * lock acquisition attempt, so it's safe
646 * to use cpu_number.
647 */
648 void
649 usld_lock_try_post(
650 usimple_lock_t l,
651 pc_t pc)
652 {
653 register int mycpu;
654 char caller[] = "successful usimple_lock_try";
655
656 if (!usld_lock_common_checks(l, caller))
657 return;
658
659 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
660 panic("%s: lock 0x%p became uninitialized",
661 caller, l);
662 if ((l->debug.state & USLOCK_TAKEN))
663 panic("%s: lock 0x%p became TAKEN by someone else",
664 caller, l);
665
666 mycpu = cpu_number();
667 l->debug.lock_thread = (void *) current_thread();
668 l->debug.state |= USLOCK_TAKEN;
669 l->debug.lock_pc = pc;
670 l->debug.lock_cpu = mycpu;
671
672 usl_trace(l, mycpu, pc, caller);
673 }
674
675
676 /*
677 * For very special cases, set traced_lock to point to a
678 * specific lock of interest. The result is a series of
679 * XPRs showing lock operations on that lock. The lock_seq
680 * value is used to show the order of those operations.
681 */
682 usimple_lock_t traced_lock;
683 unsigned int lock_seq;
684
685 void
686 usl_trace(
687 usimple_lock_t l,
688 int mycpu,
689 pc_t pc,
690 const char * op_name)
691 {
692 if (traced_lock == l) {
693 XPR(XPR_SLOCK,
694 "seq %d, cpu %d, %s @ %x\n",
695 (uintptr_t) lock_seq, (uintptr_t) mycpu,
696 (uintptr_t) op_name, (uintptr_t) pc, 0);
697 lock_seq++;
698 }
699 }
700
701
702 #endif /* USLOCK_DEBUG */
703
704 /*
705 * Routine: lck_rw_alloc_init
706 */
707 lck_rw_t *
708 lck_rw_alloc_init(
709 lck_grp_t *grp,
710 lck_attr_t *attr) {
711 lck_rw_t *lck;
712
713 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
714 bzero(lck, sizeof(lck_rw_t));
715 lck_rw_init(lck, grp, attr);
716 }
717
718 return(lck);
719 }
720
721 /*
722 * Routine: lck_rw_free
723 */
724 void
725 lck_rw_free(
726 lck_rw_t *lck,
727 lck_grp_t *grp) {
728 lck_rw_destroy(lck, grp);
729 kfree(lck, sizeof(lck_rw_t));
730 }
731
732 /*
733 * Routine: lck_rw_init
734 */
735 void
736 lck_rw_init(
737 lck_rw_t *lck,
738 lck_grp_t *grp,
739 lck_attr_t *attr)
740 {
741 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
742 attr : &LockDefaultLckAttr;
743
744 hw_lock_byte_init(&lck->lck_rw_interlock);
745 lck->lck_rw_want_write = FALSE;
746 lck->lck_rw_want_upgrade = FALSE;
747 lck->lck_rw_shared_count = 0;
748 lck->lck_rw_can_sleep = TRUE;
749 lck->lck_r_waiting = lck->lck_w_waiting = 0;
750 lck->lck_rw_tag = 0;
751 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
752 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
753
754 lck_grp_reference(grp);
755 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
756 }
757
758 /*
759 * Routine: lck_rw_destroy
760 */
761 void
762 lck_rw_destroy(
763 lck_rw_t *lck,
764 lck_grp_t *grp)
765 {
766 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
767 return;
768 #if MACH_LDEBUG
769 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
770 #endif
771 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
772 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
773 lck_grp_deallocate(grp);
774 return;
775 }
776
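/*
 * Illustrative usage sketch for the rw lock KPI implemented below (not part
 * of this file's implementation; the names are hypothetical):
 *
 *	lck_rw_t *rw = lck_rw_alloc_init(grp, LCK_ATTR_NULL);
 *
 *	lck_rw_lock_shared(rw);			(many readers may hold this at once)
 *	... read-only access ...
 *	lck_rw_unlock_shared(rw);
 *
 *	lck_rw_lock_exclusive(rw);		(single writer)
 *	... modify the protected state ...
 *	lck_rw_unlock_exclusive(rw);
 *
 *	lck_rw_free(rw, grp);
 */
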
777 /*
778 * Sleep locks. These use the same data structure and algorithm
779 * as the spin locks, but the process sleeps while it is waiting
780 * for the lock. These work on uniprocessor systems.
781 */
782
783 #define DECREMENTER_TIMEOUT 1000000
784
785 #define RW_LOCK_READER_EVENT(x) \
786 ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
787
788 #define RW_LOCK_WRITER_EVENT(x) \
789 ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
790
791 /*
792 * We disable interrupts while holding the RW interlock to prevent an
793 * interrupt from exacerbating hold time.
794 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
795 */
796 static boolean_t
797 lck_interlock_lock(lck_rw_t *lck)
798 {
799 boolean_t istate;
800
801 istate = ml_set_interrupts_enabled(FALSE);
802 hw_lock_byte_lock(&lck->lck_rw_interlock);
803
804 return istate;
805 }
806
807 static void
808 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
809 {
810 hw_lock_byte_unlock(&lck->lck_rw_interlock);
811 ml_set_interrupts_enabled(istate);
812 }
813
814 /*
815 * This inline is used when busy-waiting for an rw lock.
816 * If interrupts were disabled when the lock primitive was called,
817 * we poll the IPI handler for pending tlb flushes.
818 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
819 */
820 static inline void
821 lck_rw_lock_pause(boolean_t interrupts_enabled)
822 {
823 if (!interrupts_enabled)
824 handle_pending_TLB_flushes();
825 cpu_pause();
826 }
827
828
829 /*
830 * compute the deadline to spin against when
831 * waiting for a change of state on a lck_rw_t
832 */
833 static inline uint64_t
834 lck_rw_deadline_for_spin(lck_rw_t *lck)
835 {
836 if (lck->lck_rw_can_sleep) {
837 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
838 /*
839 * there are already threads waiting on this lock... this
840 * implies that they have spun beyond their deadlines waiting for
841 * the desired state to show up so we will not bother spinning at this time...
842 * or
843 * the current number of threads sharing this lock exceeds our capacity to run them
844 * concurrently and since all states we're going to spin for require the rw_shared_count
845 * to be at 0, we'll not bother spinning since the latency for this to happen is
846 * unpredictable...
847 */
848 return (mach_absolute_time());
849 }
850 return (mach_absolute_time() + MutexSpin);
851 } else
852 return (mach_absolute_time() + (100000LL * 1000000000LL));
853 }
854
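/*
 * The exclusive, shared and shared-to-exclusive paths below all follow the
 * same spin-then-block skeleton around this deadline; a condensed sketch
 * (state_available() stands in for the per-path predicate and is not a
 * real routine):
 *
 *	deadline = lck_rw_deadline_for_spin(lck);
 *	while ( !state_available(lck) && mach_absolute_time() < deadline)
 *		lck_rw_lock_pause(istate);
 *	if ( !state_available(lck) && lck->lck_rw_can_sleep) {
 *		istate = lck_interlock_lock(lck);
 *		... re-check under the interlock, then assert_wait()
 *		    and thread_block() if still unavailable ...
 *		lck_interlock_unlock(lck, istate);
 *	}
 */
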
855
856 /*
857 * Routine: lck_rw_lock_exclusive_gen
858 */
859 void
860 lck_rw_lock_exclusive_gen(
861 lck_rw_t *lck)
862 {
863 uint64_t deadline = 0;
864 int slept = 0;
865 int gotlock = 0;
866 int lockheld = 0;
867 wait_result_t res = 0;
868 boolean_t istate = -1;
869
870 #if CONFIG_DTRACE
871 boolean_t dtrace_ls_initialized = FALSE;
872 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
873 uint64_t wait_interval = 0;
874 int readers_at_sleep = 0;
875 #endif
876
877 /*
878 * Try to acquire the lck_rw_want_write bit.
879 */
880 while ( !lck_rw_grab_want(lck)) {
881
882 #if CONFIG_DTRACE
883 if (dtrace_ls_initialized == FALSE) {
884 dtrace_ls_initialized = TRUE;
885 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
886 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
887 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
888 if (dtrace_ls_enabled) {
889 /*
890 * Either sleeping or spinning is happening,
891 * start a timing of our delay interval now.
892 */
893 readers_at_sleep = lck->lck_rw_shared_count;
894 wait_interval = mach_absolute_time();
895 }
896 }
897 #endif
898 if (istate == -1)
899 istate = ml_get_interrupts_enabled();
900
901 deadline = lck_rw_deadline_for_spin(lck);
902
903 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
904
905 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
906 lck_rw_lock_pause(istate);
907
908 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);
909
910 if (gotlock)
911 break;
912 /*
913 * if we get here, the deadline has expired w/o us
914 * being able to grab the lock exclusively
915 * check to see if we're allowed to do a thread_block
916 */
917 if (lck->lck_rw_can_sleep) {
918
919 istate = lck_interlock_lock(lck);
920
921 if (lck->lck_rw_want_write) {
922
923 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
924
925 lck->lck_w_waiting = TRUE;
926
927 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
928 lck_interlock_unlock(lck, istate);
929
930 if (res == THREAD_WAITING) {
931 res = thread_block(THREAD_CONTINUE_NULL);
932 slept++;
933 }
934 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
935 } else {
936 lck->lck_rw_want_write = TRUE;
937 lck_interlock_unlock(lck, istate);
938 break;
939 }
940 }
941 }
942 /*
943 * Wait for readers (and upgrades) to finish...
944 * the test for these conditions must be done simultaneously with
945 * a check of the interlock not being held since
946 * the rw_shared_count will drop to 0 first and then want_upgrade
947 * will be set to 1 in the shared_to_exclusive scenario... those
948 * adjustments are done behind the interlock and represent an
949 * atomic change in state and must be considered as such
950 * however, once we see the read count at 0, the want_upgrade not set
951 * and the interlock not held, we are safe to proceed
952 */
953 while (lck_rw_held_read_or_upgrade(lck)) {
954
955 #if CONFIG_DTRACE
956 /*
957 * Either sleeping or spinning is happening, start
958 * a timing of our delay interval now. If we set it
959 * to -1 we don't have accurate data so we cannot later
960 * decide to record a dtrace spin or sleep event.
961 */
962 if (dtrace_ls_initialized == FALSE) {
963 dtrace_ls_initialized = TRUE;
964 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
965 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
966 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
967 if (dtrace_ls_enabled) {
968 /*
969 * Either sleeping or spinning is happening,
970 * start a timing of our delay interval now.
971 */
972 readers_at_sleep = lck->lck_rw_shared_count;
973 wait_interval = mach_absolute_time();
974 }
975 }
976 #endif
977 if (istate == -1)
978 istate = ml_get_interrupts_enabled();
979
980 deadline = lck_rw_deadline_for_spin(lck);
981
982 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
983
984 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
985 lck_rw_lock_pause(istate);
986
987 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);
988
989 if ( !lockheld)
990 break;
991 /*
992 * if we get here, the deadline has expired w/o us
993 * being able to grab the lock exclusively
994 * check to see if we're allowed to do a thread_block
995 */
996 if (lck->lck_rw_can_sleep) {
997
998 istate = lck_interlock_lock(lck);
999
1000 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1001 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1002
1003 lck->lck_w_waiting = TRUE;
1004
1005 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1006 lck_interlock_unlock(lck, istate);
1007
1008 if (res == THREAD_WAITING) {
1009 res = thread_block(THREAD_CONTINUE_NULL);
1010 slept++;
1011 }
1012 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
1013 } else {
1014 lck_interlock_unlock(lck, istate);
1015 /*
1016 * must own the lock now, since we checked for
1017 * readers or upgrade owner behind the interlock
1018 * no need for a call to 'lck_rw_held_read_or_upgrade'
1019 */
1020 break;
1021 }
1022 }
1023 }
1024
1025 #if CONFIG_DTRACE
1026 /*
1027 * Decide what latencies we suffered that are Dtrace events.
1028 * If we have set wait_interval, then we either spun or slept.
1029 * At least we get out from under the interlock before we record
1030 * which is the best we can do here to minimize the impact
1031 * of the tracing.
1032 * If we have set wait_interval to -1, then dtrace was not enabled when we
1033 * started sleeping/spinning so we don't record this event.
1034 */
1035 if (dtrace_ls_enabled == TRUE) {
1036 if (slept == 0) {
1037 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1038 mach_absolute_time() - wait_interval, 1);
1039 } else {
1040 /*
1041 * For the blocking case, we also record if when we blocked
1042 * it was held for read or write, and how many readers.
1043 * Notice that above we recorded this before we dropped
1044 * the interlock so the count is accurate.
1045 */
1046 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1047 mach_absolute_time() - wait_interval, 1,
1048 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1049 }
1050 }
1051 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1052 #endif
1053 }
1054
1055
1056 /*
1057 * Routine: lck_rw_done_gen
1058 *
1059 * called from the assembly language wrapper...
1060 * prior_lock_state is the value in the 1st
1061 * word of the lock at the time of a successful
1062 * atomic compare and exchange with the new value...
1063 * it represents the state of the lock before we
1064 * decremented the rw_shared_count or cleared either
1065 * rw_want_upgrade or rw_want_write and
1066 * the lck_x_waiting bits... since the wrapper
1067 * routine has already changed the state atomically,
1068 * we just need to decide if we should
1069 * wake up anyone and what value to return... we do
1070 * this by examining the state of the lock before
1071 * we changed it
1072 */
1073 lck_rw_type_t
1074 lck_rw_done_gen(
1075 lck_rw_t *lck,
1076 int prior_lock_state)
1077 {
1078 lck_rw_t *fake_lck;
1079 lck_rw_type_t lock_type;
1080 thread_t thread;
1081 uint32_t rwlock_count;
1082
1083 /*
1084 * prior_lock_state is a snapshot of the 1st word of the
1085 * lock in question... we'll fake up a pointer to it
1086 * and carefully not access anything beyond what's defined
1087 * in the first word of a lck_rw_t
1088 */
1089 fake_lck = (lck_rw_t *)&prior_lock_state;
1090
1091 if (fake_lck->lck_rw_shared_count <= 1) {
1092 if (fake_lck->lck_w_waiting)
1093 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1094
1095 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1096 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1097 }
1098 if (fake_lck->lck_rw_shared_count)
1099 lock_type = LCK_RW_TYPE_SHARED;
1100 else
1101 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1102
1103 /* Check if dropping the lock means that we need to unpromote */
1104 thread = current_thread();
1105 rwlock_count = thread->rwlock_count--;
1106 #if MACH_LDEBUG
1107 if (rwlock_count == 0) {
1108 panic("rw lock count underflow for thread %p", thread);
1109 }
1110 #endif
1111 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1112 /* sched_flags checked without lock, but will be rechecked while clearing */
1113 lck_rw_clear_promotion(thread);
1114 }
1115
1116 #if CONFIG_DTRACE
1117 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1118 #endif
1119
1120 return(lock_type);
1121 }
1122
1123
1124 /*
1125 * Routine: lck_rw_unlock
1126 */
1127 void
1128 lck_rw_unlock(
1129 lck_rw_t *lck,
1130 lck_rw_type_t lck_rw_type)
1131 {
1132 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1133 lck_rw_unlock_shared(lck);
1134 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1135 lck_rw_unlock_exclusive(lck);
1136 else
1137 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1138 }
1139
1140
1141 /*
1142 * Routine: lck_rw_unlock_shared
1143 */
1144 void
1145 lck_rw_unlock_shared(
1146 lck_rw_t *lck)
1147 {
1148 lck_rw_type_t ret;
1149
1150 ret = lck_rw_done(lck);
1151
1152 if (ret != LCK_RW_TYPE_SHARED)
1153 panic("lck_rw_unlock(): lock held in mode: %d\n", ret);
1154 }
1155
1156
1157 /*
1158 * Routine: lck_rw_unlock_exclusive
1159 */
1160 void
1161 lck_rw_unlock_exclusive(
1162 lck_rw_t *lck)
1163 {
1164 lck_rw_type_t ret;
1165
1166 ret = lck_rw_done(lck);
1167
1168 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1169 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1170 }
1171
1172
1173 /*
1174 * Routine: lck_rw_lock
1175 */
1176 void
1177 lck_rw_lock(
1178 lck_rw_t *lck,
1179 lck_rw_type_t lck_rw_type)
1180 {
1181 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1182 lck_rw_lock_shared(lck);
1183 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1184 lck_rw_lock_exclusive(lck);
1185 else
1186 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1187 }
1188
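/*
 * Illustrative sketch: the type-dispatched entry points above are used in
 * matched pairs (the lock name is hypothetical):
 *
 *	lck_rw_lock(rw, LCK_RW_TYPE_SHARED);
 *	... read-only access ...
 *	lck_rw_unlock(rw, LCK_RW_TYPE_SHARED);
 *
 * lck_rw_done() may be used instead of lck_rw_unlock() when the caller does
 * not track which mode it holds; it returns the mode that was released.
 */
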
1189
1190 /*
1191 * Routine: lck_rw_lock_shared_gen
1192 * Function:
1193 * assembly fast path code has determined that this lock
1194 * is held exclusively... this is where we spin/block
1195 * until we can acquire the lock in the shared mode
1196 */
1197 void
1198 lck_rw_lock_shared_gen(
1199 lck_rw_t *lck)
1200 {
1201 uint64_t deadline = 0;
1202 int gotlock = 0;
1203 int slept = 0;
1204 wait_result_t res = 0;
1205 boolean_t istate = -1;
1206
1207 #if CONFIG_DTRACE
1208 uint64_t wait_interval = 0;
1209 int readers_at_sleep = 0;
1210 boolean_t dtrace_ls_initialized = FALSE;
1211 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1212 #endif
1213
1214 while ( !lck_rw_grab_shared(lck)) {
1215
1216 #if CONFIG_DTRACE
1217 if (dtrace_ls_initialized == FALSE) {
1218 dtrace_ls_initialized = TRUE;
1219 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1220 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1221 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1222 if (dtrace_ls_enabled) {
1223 /*
1224 * Either sleeping or spinning is happening,
1225 * start a timing of our delay interval now.
1226 */
1227 readers_at_sleep = lck->lck_rw_shared_count;
1228 wait_interval = mach_absolute_time();
1229 }
1230 }
1231 #endif
1232 if (istate == -1)
1233 istate = ml_get_interrupts_enabled();
1234
1235 deadline = lck_rw_deadline_for_spin(lck);
1236
1237 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1238 (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1239
1240 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1241 lck_rw_lock_pause(istate);
1242
1243 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1244 (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1245
1246 if (gotlock)
1247 break;
1248 /*
1249 * if we get here, the deadline has expired w/o us
1250 * being able to grab the lock for read
1251 * check to see if we're allowed to do a thread_block
1252 */
1253 if (lck->lck_rw_can_sleep) {
1254
1255 istate = lck_interlock_lock(lck);
1256
1257 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1258 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1259
1260 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1261 (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1262
1263 lck->lck_r_waiting = TRUE;
1264
1265 res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1266 lck_interlock_unlock(lck, istate);
1267
1268 if (res == THREAD_WAITING) {
1269 res = thread_block(THREAD_CONTINUE_NULL);
1270 slept++;
1271 }
1272 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1273 (int)lck, res, slept, 0, 0);
1274 } else {
1275 lck->lck_rw_shared_count++;
1276 lck_interlock_unlock(lck, istate);
1277 break;
1278 }
1279 }
1280 }
1281
1282 #if CONFIG_DTRACE
1283 if (dtrace_ls_enabled == TRUE) {
1284 if (slept == 0) {
1285 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1286 } else {
1287 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1288 mach_absolute_time() - wait_interval, 0,
1289 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1290 }
1291 }
1292 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1293 #endif
1294 }
1295
1296
1297 /*
1298 * Routine: lck_rw_lock_shared_to_exclusive_failure
1299 * Function:
1300 * assembly fast path code has already dropped our read
1301 * count and determined that someone else owns 'lck_rw_want_upgrade'
1302 * if 'lck_rw_shared_count' == 0, it's also already dropped 'lck_w_waiting'
1303 * all we need to do here is determine if a wakeup is needed
1304 */
1305 boolean_t
1306 lck_rw_lock_shared_to_exclusive_failure(
1307 lck_rw_t *lck,
1308 int prior_lock_state)
1309 {
1310 lck_rw_t *fake_lck;
1311 thread_t thread = current_thread();
1312 uint32_t rwlock_count;
1313
1314 /* Check if dropping the lock means that we need to unpromote */
1315 rwlock_count = thread->rwlock_count--;
1316 #if MACH_LDEBUG
1317 if (rwlock_count == 0) {
1318 panic("rw lock count underflow for thread %p", thread);
1319 }
1320 #endif
1321 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1322 /* sched_flags checked without lock, but will be rechecked while clearing */
1323 lck_rw_clear_promotion(thread);
1324 }
1325
1326 /*
1327 * prior_lock_state is a snapshot of the 1st word of the
1328 * lock in question... we'll fake up a pointer to it
1329 * and carefully not access anything beyond what's defined
1330 * in the first word of a lck_rw_t
1331 */
1332 fake_lck = (lck_rw_t *)&prior_lock_state;
1333
1334 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1335 /*
1336 * Someone else has requested upgrade.
1337 * Since we've released the read lock, wake
1338 * him up if he's blocked waiting
1339 */
1340 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1341 }
1342 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1343 (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1344
1345 return (FALSE);
1346 }
1347
1348
1349 /*
1350 * Routine: lck_rw_lock_shared_to_exclusive_success
1351 * Function:
1352 * assembly fast path code has already dropped our read
1353 * count and successfully acquired 'lck_rw_want_upgrade'
1354 * we just need to wait for the rest of the readers to drain
1355 * and then we can return as the exclusive holder of this lock
1356 */
1357 boolean_t
1358 lck_rw_lock_shared_to_exclusive_success(
1359 lck_rw_t *lck)
1360 {
1361 uint64_t deadline = 0;
1362 int slept = 0;
1363 int still_shared = 0;
1364 wait_result_t res;
1365 boolean_t istate = -1;
1366
1367 #if CONFIG_DTRACE
1368 uint64_t wait_interval = 0;
1369 int readers_at_sleep = 0;
1370 boolean_t dtrace_ls_initialized = FALSE;
1371 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1372 #endif
1373
1374 while (lck->lck_rw_shared_count != 0) {
1375
1376 #if CONFIG_DTRACE
1377 if (dtrace_ls_initialized == FALSE) {
1378 dtrace_ls_initialized = TRUE;
1379 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1380 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1381 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1382 if (dtrace_ls_enabled) {
1383 /*
1384 * Either sleeping or spinning is happening,
1385 * start a timing of our delay interval now.
1386 */
1387 readers_at_sleep = lck->lck_rw_shared_count;
1388 wait_interval = mach_absolute_time();
1389 }
1390 }
1391 #endif
1392 if (istate == -1)
1393 istate = ml_get_interrupts_enabled();
1394
1395 deadline = lck_rw_deadline_for_spin(lck);
1396
1397 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1398 (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1399
1400 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1401 lck_rw_lock_pause(istate);
1402
1403 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1404 (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1405
1406 if ( !still_shared)
1407 break;
1408 /*
1409 * if we get here, the deadline has expired w/o
1410 * the rw_shared_count having drained to 0
1411 * check to see if we're allowed to do a thread_block
1412 */
1413 if (lck->lck_rw_can_sleep) {
1414
1415 istate = lck_interlock_lock(lck);
1416
1417 if (lck->lck_rw_shared_count != 0) {
1418 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1419 (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1420
1421 lck->lck_w_waiting = TRUE;
1422
1423 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1424 lck_interlock_unlock(lck, istate);
1425
1426 if (res == THREAD_WAITING) {
1427 res = thread_block(THREAD_CONTINUE_NULL);
1428 slept++;
1429 }
1430 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1431 (int)lck, res, slept, 0, 0);
1432 } else {
1433 lck_interlock_unlock(lck, istate);
1434 break;
1435 }
1436 }
1437 }
1438 #if CONFIG_DTRACE
1439 /*
1440 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1441 */
1442 if (dtrace_ls_enabled == TRUE) {
1443 if (slept == 0) {
1444 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1445 } else {
1446 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1447 mach_absolute_time() - wait_interval, 1,
1448 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1449 }
1450 }
1451 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1452 #endif
1453 return (TRUE);
1454 }
1455
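/*
 * Illustrative caller-side upgrade pattern (the fast path lives in assembly;
 * this sketch only shows the KPI contract):
 *
 *	lck_rw_lock_shared(rw);
 *	...
 *	if (lck_rw_lock_shared_to_exclusive(rw)) {
 *		... upgrade succeeded: the lock is now held exclusively ...
 *		lck_rw_unlock_exclusive(rw);
 *	} else {
 *		... upgrade failed: the read hold was dropped (see the
 *		    _failure path above), so reacquire if still needed ...
 *		lck_rw_lock_exclusive(rw);
 *		lck_rw_unlock_exclusive(rw);
 *	}
 */
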
1456
1457 /*
1458 * Routine: lck_rw_lock_exclusive_to_shared
1459 * Function:
1460 * assembly fast path has already dropped
1461 * our exclusive state and bumped lck_rw_shared_count
1462 * all we need to do here is determine if anyone
1463 * needs to be awakened.
1464 */
1465 void
1466 lck_rw_lock_exclusive_to_shared_gen(
1467 lck_rw_t *lck,
1468 int prior_lock_state)
1469 {
1470 lck_rw_t *fake_lck;
1471
1472 /*
1473 * prior_lock_state is a snapshot of the 1st word of the
1474 * lock in question... we'll fake up a pointer to it
1475 * and carefully not access anything beyond what's defined
1476 * in the first word of a lck_rw_t
1477 */
1478 fake_lck = (lck_rw_t *)&prior_lock_state;
1479
1480 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1481 (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1482
1483 /*
1484 * don't wake up anyone waiting to take the lock exclusively
1485 * since we hold a read count... when the read count drops to 0,
1486 * the writers will be woken.
1487 *
1488 * wake up any waiting readers if we don't have any writers waiting,
1489 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1490 */
1491 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1492 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1493
1494 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1495 (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1496
1497 #if CONFIG_DTRACE
1498 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1499 #endif
1500 }
1501
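/*
 * Illustrative downgrade pattern (sketch only; the lock name is hypothetical):
 *
 *	lck_rw_lock_exclusive(rw);
 *	... perform the update ...
 *	lck_rw_lock_exclusive_to_shared(rw);	(keep reading, let other readers in)
 *	... read-only access ...
 *	lck_rw_unlock_shared(rw);
 */
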
1502
1503 /*
1504 * Routine: lck_rw_try_lock
1505 */
1506 boolean_t
1507 lck_rw_try_lock(
1508 lck_rw_t *lck,
1509 lck_rw_type_t lck_rw_type)
1510 {
1511 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1512 return(lck_rw_try_lock_shared(lck));
1513 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1514 return(lck_rw_try_lock_exclusive(lck));
1515 else
1516 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1517 return(FALSE);
1518 }
1519
1520
1521 void
1522 lck_rw_assert(
1523 lck_rw_t *lck,
1524 unsigned int type)
1525 {
1526 switch (type) {
1527 case LCK_RW_ASSERT_SHARED:
1528 if (lck->lck_rw_shared_count != 0) {
1529 return;
1530 }
1531 break;
1532 case LCK_RW_ASSERT_EXCLUSIVE:
1533 if ((lck->lck_rw_want_write ||
1534 lck->lck_rw_want_upgrade) &&
1535 lck->lck_rw_shared_count == 0) {
1536 return;
1537 }
1538 break;
1539 case LCK_RW_ASSERT_HELD:
1540 if (lck->lck_rw_want_write ||
1541 lck->lck_rw_want_upgrade ||
1542 lck->lck_rw_shared_count != 0) {
1543 return;
1544 }
1545 break;
1546 case LCK_RW_ASSERT_NOTHELD:
1547 if (!(lck->lck_rw_want_write ||
1548 lck->lck_rw_want_upgrade ||
1549 lck->lck_rw_shared_count != 0)) {
1550 return;
1551 }
1552 break;
1553 default:
1554 break;
1555 }
1556
1557 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1558 }
1559
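/*
 * Illustrative use of lck_rw_assert() (sketch only):
 *
 *	lck_rw_lock_shared(rw);
 *	lck_rw_assert(rw, LCK_RW_ASSERT_SHARED);	(panics if not held shared)
 *	lck_rw_unlock_shared(rw);
 *	lck_rw_assert(rw, LCK_RW_ASSERT_NOTHELD);
 */
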
1560 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1561 void
1562 lck_rw_clear_promotions_x86(thread_t thread)
1563 {
1564 #if MACH_LDEBUG
1565 /* It's fatal to leave a RW lock locked and return to userspace */
1566 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1567 #else
1568 /* Paper over the issue */
1569 thread->rwlock_count = 0;
1570 lck_rw_clear_promotion(thread);
1571 #endif
1572 }
1573
1574
1575 #ifdef MUTEX_ZONE
1576 extern zone_t lck_mtx_zone;
1577 #endif
1578 /*
1579 * Routine: lck_mtx_alloc_init
1580 */
1581 lck_mtx_t *
1582 lck_mtx_alloc_init(
1583 lck_grp_t *grp,
1584 lck_attr_t *attr)
1585 {
1586 lck_mtx_t *lck;
1587 #ifdef MUTEX_ZONE
1588 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
1589 lck_mtx_init(lck, grp, attr);
1590 #else
1591 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1592 lck_mtx_init(lck, grp, attr);
1593 #endif
1594 return(lck);
1595 }
1596
1597 /*
1598 * Routine: lck_mtx_free
1599 */
1600 void
1601 lck_mtx_free(
1602 lck_mtx_t *lck,
1603 lck_grp_t *grp)
1604 {
1605 lck_mtx_destroy(lck, grp);
1606 #ifdef MUTEX_ZONE
1607 zfree(lck_mtx_zone, lck);
1608 #else
1609 kfree(lck, sizeof(lck_mtx_t));
1610 #endif
1611 }
1612
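/*
 * Illustrative usage sketch for the mutex KPI (not part of this file's
 * implementation; the names are hypothetical):
 *
 *	lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);
 *
 *	lck_mtx_lock(mtx);
 *	... may block; must not be taken from interrupt context ...
 *	lck_mtx_unlock(mtx);
 *
 *	lck_mtx_free(mtx, grp);
 */
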
1613 /*
1614 * Routine: lck_mtx_ext_init
1615 */
1616 static void
1617 lck_mtx_ext_init(
1618 lck_mtx_ext_t *lck,
1619 lck_grp_t *grp,
1620 lck_attr_t *attr)
1621 {
1622 bzero((void *)lck, sizeof(lck_mtx_ext_t));
1623
1624 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1625 lck->lck_mtx_deb.type = MUTEX_TAG;
1626 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1627 }
1628
1629 lck->lck_mtx_grp = grp;
1630
1631 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1632 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1633
1634 lck->lck_mtx.lck_mtx_is_ext = 1;
1635 lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1636 }
1637
1638 /*
1639 * Routine: lck_mtx_init
1640 */
1641 void
1642 lck_mtx_init(
1643 lck_mtx_t *lck,
1644 lck_grp_t *grp,
1645 lck_attr_t *attr)
1646 {
1647 lck_mtx_ext_t *lck_ext;
1648 lck_attr_t *lck_attr;
1649
1650 if (attr != LCK_ATTR_NULL)
1651 lck_attr = attr;
1652 else
1653 lck_attr = &LockDefaultLckAttr;
1654
1655 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1656 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1657 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1658 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1659 lck->lck_mtx_ptr = lck_ext;
1660 }
1661 } else {
1662 lck->lck_mtx_owner = 0;
1663 lck->lck_mtx_state = 0;
1664 }
1665 lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1666 lck_grp_reference(grp);
1667 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1668 }
1669
1670 /*
1671 * Routine: lck_mtx_init_ext
1672 */
1673 void
1674 lck_mtx_init_ext(
1675 lck_mtx_t *lck,
1676 lck_mtx_ext_t *lck_ext,
1677 lck_grp_t *grp,
1678 lck_attr_t *attr)
1679 {
1680 lck_attr_t *lck_attr;
1681
1682 if (attr != LCK_ATTR_NULL)
1683 lck_attr = attr;
1684 else
1685 lck_attr = &LockDefaultLckAttr;
1686
1687 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1688 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1689 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1690 lck->lck_mtx_ptr = lck_ext;
1691 } else {
1692 lck->lck_mtx_owner = 0;
1693 lck->lck_mtx_state = 0;
1694 }
1695 lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1696
1697 lck_grp_reference(grp);
1698 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1699 }
1700
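/*
 * Illustrative sketch of initializing an embedded mutex with debugging
 * enabled, which routes through lck_mtx_ext_init() above (assumes the
 * standard lck_attr_* KPIs; the names are hypothetical):
 *
 *	lck_attr_t *attr = lck_attr_alloc_init();
 *	lck_attr_setdebug(attr);		(sets LCK_ATTR_DEBUG)
 *	lck_mtx_init(&some_object->mtx, grp, attr);
 *	lck_attr_free(attr);
 */
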
1701 /*
1702 * Routine: lck_mtx_destroy
1703 */
1704 void
1705 lck_mtx_destroy(
1706 lck_mtx_t *lck,
1707 lck_grp_t *grp)
1708 {
1709 boolean_t lck_is_indirect;
1710
1711 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1712 return;
1713 #if MACH_LDEBUG
1714 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
1715 #endif
1716 lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1717
1718 lck_mtx_lock_mark_destroyed(lck);
1719
1720 if (lck_is_indirect)
1721 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1722 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1723 lck_grp_deallocate(grp);
1724 return;
1725 }
1726
1727
1728 #define LCK_MTX_LCK_WAIT_CODE 0x20
1729 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
1730 #define LCK_MTX_LCK_SPIN_CODE 0x22
1731 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
1732 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
1733
1734
1735 /*
1736 * Routine: lck_mtx_unlock_wakeup_x86
1737 *
1738 * Invoked on unlock when there is
1739 * contention (i.e. the assembly routine sees that
1740 * mutex->lck_mtx_waiters != 0 or
1741 * mutex->lck_mtx_promoted != 0)...
1742 *
1743 * neither the mutex nor the interlock is held
1744 */
1745 void
1746 lck_mtx_unlock_wakeup_x86 (
1747 lck_mtx_t *mutex,
1748 int prior_lock_state)
1749 {
1750 lck_mtx_t fake_lck;
1751
1752 /*
1753 * prior_lock_state is a snapshot of the 2nd word of the
1754 * lock in question... we'll fake up a lock with the bits
1755 * copied into place and carefully not access anything
1756 * beyond what's defined in the second word of a lck_mtx_t
1757 */
1758 fake_lck.lck_mtx_state = prior_lock_state;
1759
1760 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
1761 mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
1762
1763 if (__probable(fake_lck.lck_mtx_waiters)) {
1764 if (fake_lck.lck_mtx_waiters > 1)
1765 thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
1766 else
1767 thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
1768 }
1769
1770 if (__improbable(fake_lck.lck_mtx_promoted)) {
1771 thread_t thread = current_thread();
1772
1773
1774 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
1775 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
1776
1777 if (thread->promotions > 0) {
1778 spl_t s = splsched();
1779
1780 thread_lock(thread);
1781
1782 if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
1783
1784 thread->sched_flags &= ~TH_SFLAG_PROMOTED;
1785
1786 if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
1787 /* Thread still has a RW lock promotion */
1788 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1789 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1790 thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
1791
1792 set_sched_pri(thread, DEPRESSPRI);
1793 }
1794 else {
1795 if (thread->priority < thread->sched_pri) {
1796 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1797 thread->sched_pri, thread->priority, 0, mutex, 0);
1798
1799 SCHED(compute_priority)(thread, FALSE);
1800 }
1801 }
1802 }
1803 thread_unlock(thread);
1804 splx(s);
1805 }
1806 }
1807 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
1808 mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1809 }
1810
1811
1812 /*
1813 * Routine: lck_mtx_lock_acquire_x86
1814 *
1815 * Invoked on acquiring the mutex when there is
1816 * contention (i.e. the assembly routine sees that
1817 * mutex->lck_mtx_waiters != 0 or
1818 * thread->was_promoted_on_wakeup != 0)...
1819 *
1820 * mutex is owned... interlock is held... preemption is disabled
1821 */
1822 void
1823 lck_mtx_lock_acquire_x86(
1824 lck_mtx_t *mutex)
1825 {
1826 thread_t thread;
1827 integer_t priority;
1828 spl_t s;
1829
1830 thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
1831
1832 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
1833 mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1834
1835 if (mutex->lck_mtx_waiters)
1836 priority = mutex->lck_mtx_pri;
1837 else
1838 priority = 0;
1839
1840 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
1841
1842 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
1843 thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);
1844
1845 s = splsched();
1846 thread_lock(thread);
1847
1848 if (thread->sched_pri < priority) {
1849 /* Do not promote past promotion ceiling */
1850 assert(priority <= MAXPRI_PROMOTE);
1851 set_sched_pri(thread, priority);
1852 }
1853 if (mutex->lck_mtx_promoted == 0) {
1854 mutex->lck_mtx_promoted = 1;
1855
1856 thread->promotions++;
1857 thread->sched_flags |= TH_SFLAG_PROMOTED;
1858 }
1859 thread->was_promoted_on_wakeup = 0;
1860
1861 thread_unlock(thread);
1862 splx(s);
1863 }
1864 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1865 mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1866 }
1867
1868
1869
1870 /*
1871 * Routine: lck_mtx_lock_spinwait_x86
1872 *
1873 * Invoked trying to acquire a mutex when there is contention but
1874 * the holder is running on another processor. We spin for up to a maximum
1875 * time waiting for the lock to be released.
1876 *
1877 * Called with the interlock unlocked.
1878 * returns 0 if mutex acquired
1879 * returns 1 if we spun
1880 * returns 2 if we didn't spin due to the holder not running
1881 */
1882 int
1883 lck_mtx_lock_spinwait_x86(
1884 lck_mtx_t *mutex)
1885 {
1886 thread_t holder;
1887 uint64_t deadline;
1888 int retval = 1;
1889 int loopcount = 0;
1890
1891
1892 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
1893 mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
1894
1895 deadline = mach_absolute_time() + MutexSpin;
1896
1897 /*
1898 * Spin while:
1899 * - mutex is locked, and
1900 * - it's locked as a spin lock, and
1901 * - owner is running on another processor, and
1902 * - owner (processor) is not idling, and
1903 * - we haven't spun for long enough.
1904 */
1905 do {
1906 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
1907 retval = 0;
1908 break;
1909 }
1910 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
1911
1912 if ( !(holder->machine.specFlags & OnProc) ||
1913 (holder->state & TH_IDLE)) {
1914 if (loopcount == 0)
1915 retval = 2;
1916 break;
1917 }
1918 }
1919 cpu_pause();
1920
1921 loopcount++;
1922
1923 } while (mach_absolute_time() < deadline);
1924
1925
1926 #if CONFIG_DTRACE
1927 /*
1928 * We've already kept a count via deadline of how long we spun.
1929 * If dtrace is active, then we compute backwards to decide how
1930 * long we spun.
1931 *
1932 * Note that we record a different probe id depending on whether
1933 * this is a direct or indirect mutex. This allows us to
1934 * penalize only lock groups that have debug/stats enabled
1935 * with dtrace processing if desired.
1936 */
1937 if (__probable(mutex->lck_mtx_is_ext == 0)) {
1938 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
1939 mach_absolute_time() - (deadline - MutexSpin));
1940 } else {
1941 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
1942 mach_absolute_time() - (deadline - MutexSpin));
1943 }
1944 /* The lockstat acquire event is recorded by the assembly code beneath us. */
1945 #endif
1946
1947 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
1948 mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
1949
1950 return retval;
1951 }
1952
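/*
 * Sketch of how the return value is interpreted; the real caller is the
 * assembly lock path, so this is illustrative only:
 *
 *	switch (lck_mtx_lock_spinwait_x86(mutex)) {
 *	case 0:		the mutex was grabbed while spinning: done
 *	case 1:		we spun until the deadline: take the interlock
 *			and block via lck_mtx_lock_wait_x86()
 *	case 2:		the holder isn't running: skip the spin and block
 *	}
 */
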
1953
1954
1955 /*
1956 * Routine: lck_mtx_lock_wait_x86
1957 *
1958 * Invoked in order to wait on contention.
1959 *
1960 * Called with the interlock locked and
1961 * preemption disabled...
1962 * returns it unlocked and with preemption enabled
1963 */
1964 void
1965 lck_mtx_lock_wait_x86 (
1966 lck_mtx_t *mutex)
1967 {
1968 thread_t self = current_thread();
1969 thread_t holder;
1970 integer_t priority;
1971 spl_t s;
1972 #if CONFIG_DTRACE
1973 uint64_t sleep_start = 0;
1974
1975 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
1976 sleep_start = mach_absolute_time();
1977 }
1978 #endif
1979 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
1980 mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1981
1982 priority = self->sched_pri;
1983
1984 if (priority < self->priority)
1985 priority = self->priority;
1986 if (priority < BASEPRI_DEFAULT)
1987 priority = BASEPRI_DEFAULT;
1988
1989 /* Do not promote past promotion ceiling */
1990 priority = MIN(priority, MAXPRI_PROMOTE);
1991
1992 if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
1993 mutex->lck_mtx_pri = priority;
1994 mutex->lck_mtx_waiters++;
1995
1996 if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
1997 holder->sched_pri < mutex->lck_mtx_pri ) {
1998 s = splsched();
1999 thread_lock(holder);
2000
2001 /* holder priority may have been bumped by another thread
2002 * before thread_lock was taken
2003 */
2004 if (holder->sched_pri < mutex->lck_mtx_pri) {
2005 KERNEL_DEBUG_CONSTANT(
2006 MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
2007 holder->sched_pri, priority, thread_tid(holder), mutex, 0);
2008 /* Assert that we're not altering the priority of a
2009 * thread above the MAXPRI_PROMOTE band
2010 */
2011 assert(holder->sched_pri < MAXPRI_PROMOTE);
2012 set_sched_pri(holder, priority);
2013
2014 if (mutex->lck_mtx_promoted == 0) {
2015 holder->promotions++;
2016 holder->sched_flags |= TH_SFLAG_PROMOTED;
2017
2018 mutex->lck_mtx_promoted = 1;
2019 }
2020 }
2021 thread_unlock(holder);
2022 splx(s);
2023 }
2024 assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
2025
2026 lck_mtx_ilk_unlock(mutex);
2027
2028 thread_block(THREAD_CONTINUE_NULL);
2029
2030 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
2031 mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2032
2033 #if CONFIG_DTRACE
2034 /*
2035 * Record the Dtrace lockstat probe for blocking, block time
2036 * measured from when we were entered.
2037 */
2038 if (sleep_start) {
2039 if (mutex->lck_mtx_is_ext == 0) {
2040 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2041 mach_absolute_time() - sleep_start);
2042 } else {
2043 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2044 mach_absolute_time() - sleep_start);
2045 }
2046 }
2047 #endif
2048 }