osfmk/i386/locks_i386.c

   1 /*
   2  * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  *      File:   kern/lock.c
  58  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  59  *      Date:   1985
  60  *
  61  *      Locking primitives implementation
  62  */
  63
  64 #include <mach_kdb.h>
  65 #include <mach_ldebug.h>
  66
  67 #include <kern/lock.h>
  68 #include <kern/locks.h>
  69 #include <kern/kalloc.h>
  70 #include <kern/misc_protos.h>
  71 #include <kern/thread.h>
  72 #include <kern/processor.h>
  73 #include <kern/cpu_data.h>
  74 #include <kern/cpu_number.h>
  75 #include <kern/sched_prim.h>
  76 #include <kern/xpr.h>
  77 #include <kern/debug.h>
  78 #include <string.h>
  79
  80 #include <i386/mp.h> /* mp_recent_debugger_activity() */
  81 #if     MACH_KDB
  82 #include <ddb/db_command.h>
  83 #include <ddb/db_output.h>
  84 #include <ddb/db_sym.h>
  85 #include <ddb/db_print.h>
  86 #endif  /* MACH_KDB */
  87
  88 #include <machine/machine_cpu.h>
  89
  90 #include <sys/kdebug.h>
  91
  92 /*
  93  * We need only enough declarations from the BSD-side to be able to
  94  * test if our probe is active, and to call __dtrace_probe().  Setting
  95  * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
  96  */
  97 #if     CONFIG_DTRACE
  98 #define NEED_DTRACE_DEFS
  99 #include <../bsd/sys/lockstat.h>
 100 #endif
 101
 102 #define LCK_RW_LCK_EXCLUSIVE_CODE       0x100   /* rw-lock kdebug trace codes; this file passes them as MACHDBG_CODE(DBG_MACH_LOCKS, <code>) */
 103 #define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
 104 #define LCK_RW_LCK_SHARED_CODE          0x102
 105 #define LCK_RW_LCK_SH_TO_EX_CODE        0x103
 106 #define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
 107 #define LCK_RW_LCK_EX_TO_SH_CODE        0x105
 108
 109 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106   /* traced around the spin loop in lck_rw_lock_exclusive_gen() (DBG_FUNC_START / DBG_FUNC_END) */
 110 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107   /* traced when lck_rw_lock_exclusive_gen() is about to block */
 111 #define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
 112 #define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
 113 #define LCK_RW_LCK_SHARED_SPIN_CODE     0x110   /* NOTE: values are hex, so 0x10a-0x10f are skipped between this code and the previous one */
 114 #define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
 115 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
 116 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
 117
 118
 119 #define ANY_LOCK_DEBUG  (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
 120
 121 unsigned int LcksOpts=0;
 122
 123 /* Forwards */
 124
 125 #if     MACH_KDB
 126 void    db_print_simple_lock(
 127                         simple_lock_t   addr);
 128 #endif  /* MACH_KDB */
 129
 130
 131 #if     USLOCK_DEBUG
 132 /*
 133  *      Perform simple lock checks.
 134  */
 135 int     uslock_check = 1;
 136 int     max_lock_loops  = 100000000;
 137 decl_simple_lock_data(extern , printf_lock)
 138 decl_simple_lock_data(extern , panic_lock)
 139 #endif  /* USLOCK_DEBUG */
 140
 141
 142 /*
 143  *      We often want to know the addresses of the callers
 144  *      of the various lock routines.  However, this information
 145  *      is only used for debugging and statistics.
 146  */
 147 typedef void    *pc_t;          /* a caller address; stored in the lock's debug.lock_pc / debug.unlock_pc */
 148 #define INVALID_PC      ((void *) VM_MAX_KERNEL_ADDRESS)        /* sentinel meaning "no pc recorded" */
 149 #define INVALID_THREAD  ((void *) VM_MAX_KERNEL_ADDRESS)        /* sentinel meaning "no thread recorded"; same value as INVALID_PC */
 150 #if     ANY_LOCK_DEBUG
 151 #define OBTAIN_PC(pc)   ((pc) = GET_RETURN_PC())        /* store GET_RETURN_PC() into the local declared by DECL_PC */
 152 #define DECL_PC(pc)     pc_t pc;                        /* declares the local; note the expansion already ends in ';' */
 153 #else   /* !ANY_LOCK_DEBUG */
 154 #define DECL_PC(pc)             /* expands to nothing when no lock debugging is configured */
 155 #ifdef  lint
 156 /*
 157  *      Eliminate lint complaints about unused local pc variables.
 158  */
 159 #define OBTAIN_PC(pc)   ++pc
 160 #else   /* lint */
 161 #define OBTAIN_PC(pc)           /* expands to nothing when no lock debugging is configured */
 162 #endif  /* lint */
 163 #endif  /* ANY_LOCK_DEBUG */
 164
 165
 166 /*
 167  *      Portable lock package implementation of usimple_locks.
 168  */
 169
 170 #if     USLOCK_DEBUG
 171 #define USLDBG(stmt)    stmt
 172 void            usld_lock_init(usimple_lock_t, unsigned short);
 173 void            usld_lock_pre(usimple_lock_t, pc_t);
 174 void            usld_lock_post(usimple_lock_t, pc_t);
 175 void            usld_unlock(usimple_lock_t, pc_t);
 176 void            usld_lock_try_pre(usimple_lock_t, pc_t);
 177 void            usld_lock_try_post(usimple_lock_t, pc_t);
 178 int             usld_lock_common_checks(usimple_lock_t, char *);
 179 #else   /* USLOCK_DEBUG */
 180 #define USLDBG(stmt)
 181 #endif  /* USLOCK_DEBUG */
 182
 183
 184 extern int lck_rw_grab_want(lck_rw_t *lck);
 185 extern int lck_rw_grab_shared(lck_rw_t *lck);
 186 extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
 187
 188
 189 /*
 190  * Forward definitions
 191  */
 192
 193 void lck_rw_lock_shared_gen(
 194         lck_rw_t        *lck);
 195
 196 void lck_rw_lock_exclusive_gen(
 197         lck_rw_t        *lck);
 198
 199 boolean_t lck_rw_lock_shared_to_exclusive_success(
 200         lck_rw_t        *lck);
 201
 202 boolean_t lck_rw_lock_shared_to_exclusive_failure(
 203         lck_rw_t        *lck,
 204         int             prior_lock_state);
 205
 206 void lck_rw_lock_exclusive_to_shared_gen(
 207         lck_rw_t        *lck,
 208         int             prior_lock_state);
 209
 210 lck_rw_type_t lck_rw_done_gen(
 211         lck_rw_t        *lck,
 212         int             prior_lock_state);
 213
 214
 215 /*
 216  *      Routine:        lck_spin_alloc_init
 217  */
 218 lck_spin_t *
 219 lck_spin_alloc_init(
 220         lck_grp_t       *grp,
 221         lck_attr_t      *attr)
 222 {
 223         lck_spin_t      *lck;
 224
 225         if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
 226                 lck_spin_init(lck, grp, attr);
 227
 228         return(lck);
 229 }
 230
 231 /*
 232  *      Routine:        lck_spin_free
 233  */
 234 void
 235 lck_spin_free(
 236         lck_spin_t      *lck,
 237         lck_grp_t       *grp)
 238 {
 239         lck_spin_destroy(lck, grp);
 240         kfree(lck, sizeof(lck_spin_t));
 241 }
 242
 243 /*
 244  *      Routine:        lck_spin_init
 245  */
 246 void
 247 lck_spin_init(
 248         lck_spin_t      *lck,
 249         lck_grp_t       *grp,
 250         __unused lck_attr_t     *attr)
 251 {
 252         usimple_lock_init((usimple_lock_t) lck, 0);
 253         lck_grp_reference(grp);
 254         lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
 255 }
 256
 257 /*
 258  *      Routine:        lck_spin_destroy
 259  */
 260 void
 261 lck_spin_destroy(
 262         lck_spin_t      *lck,
 263         lck_grp_t       *grp)
 264 {
 265         if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
 266                 return;
 267         lck->interlock = LCK_SPIN_TAG_DESTROYED;
 268         lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
 269         lck_grp_deallocate(grp);
 270         return;
 271 }
 272
 273 /*
 274  *      Routine:        lck_spin_lock
 275  */
 276 void
 277 lck_spin_lock(
 278         lck_spin_t      *lck)
 279 {
 280         usimple_lock((usimple_lock_t) lck);
 281 }
 282
 283 /*
 284  *      Routine:        lck_spin_unlock
 285  */
 286 void
 287 lck_spin_unlock(
 288         lck_spin_t      *lck)
 289 {
 290         usimple_unlock((usimple_lock_t) lck);
 291 }
 292
 293
 294 /*
 295  *      Routine:        lck_spin_try_lock
 296  */
 297 boolean_t
 298 lck_spin_try_lock(
 299         lck_spin_t      *lck)
 300 {
 301         return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
 302 }
 303
 304 /*
 305  *      Initialize a usimple_lock.
 306  *
 307  *      No change in preemption state.
 308  */
 309 void
 310 usimple_lock_init(
 311         usimple_lock_t  l,
 312         __unused unsigned short tag)
 313 {
 314 #ifndef MACHINE_SIMPLE_LOCK
 315         USLDBG(usld_lock_init(l, tag));
 316         hw_lock_init(&l->interlock);
 317 #else
 318         simple_lock_init((simple_lock_t)l,tag);
 319 #endif
 320 }
 321
 322
 323 /*
 324  *      Acquire a usimple_lock.
 325  *
 326  *      Returns with preemption disabled.  Note
 327  *      that the hw_lock routines are responsible for
 328  *      maintaining preemption state.
 329  */
 330 void
 331 usimple_lock(
 332         usimple_lock_t  l)
 333 {
 334 #ifndef MACHINE_SIMPLE_LOCK
 335         DECL_PC(pc);
 336
 337         OBTAIN_PC(pc);
 338         USLDBG(usld_lock_pre(l, pc));
 339
 340         if(!hw_lock_to(&l->interlock, LockTimeOutTSC))  {/* Try to get the lock
 341                                                           * with a timeout */
 342                 boolean_t uslock_acquired = FALSE;
 343                 while (mp_recent_debugger_activity() &&
 344                     !(uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)));
 345                 if (uslock_acquired == FALSE)
 346                         panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p", l, (uintptr_t)l->interlock.lock_data, current_thread());
 347         }
 348         USLDBG(usld_lock_post(l, pc));
 349 #else
 350         simple_lock((simple_lock_t)l);
 351 #endif
 352 }
 353
 354
 355 /*
 356  *      Release a usimple_lock.
 357  *
 358  *      Returns with preemption enabled.  Note
 359  *      that the hw_lock routines are responsible for
 360  *      maintaining preemption state.
 361  */
 362 void
 363 usimple_unlock(
 364         usimple_lock_t  l)
 365 {
 366 #ifndef MACHINE_SIMPLE_LOCK
 367         DECL_PC(pc);
 368
 369         OBTAIN_PC(pc);
 370         USLDBG(usld_unlock(l, pc));
 371         hw_lock_unlock(&l->interlock);
 372 #else
 373         simple_unlock_rwmb((simple_lock_t)l);
 374 #endif
 375 }
 376
 377
 378 /*
 379  *      Conditionally acquire a usimple_lock.
 380  *
 381  *      On success, returns with preemption disabled.
 382  *      On failure, returns with preemption in the same state
 383  *      as when first invoked.  Note that the hw_lock routines
 384  *      are responsible for maintaining preemption state.
 385  *
 386  *      XXX No stats are gathered on a miss; I preserved this
 387  *      behavior from the original assembly-language code, but
 388  *      doesn't it make sense to log misses?  XXX
 389  */
 390 unsigned int
 391 usimple_lock_try(
 392         usimple_lock_t  l)
 393 {
 394 #ifndef MACHINE_SIMPLE_LOCK
 395         unsigned int    success;
 396         DECL_PC(pc);
 397
 398         OBTAIN_PC(pc);
 399         USLDBG(usld_lock_try_pre(l, pc));
 400         if ((success = hw_lock_try(&l->interlock))) {
 401                 USLDBG(usld_lock_try_post(l, pc));
 402         }
 403         return success;
 404 #else
 405         return(simple_lock_try((simple_lock_t)l));
 406 #endif
 407 }
 408
 409 #if     USLOCK_DEBUG
 410 /*
 411  *      States of a usimple_lock.  The default when initializing
 412  *      a usimple_lock is setting it up for debug checking.
 413  */
 414 #define USLOCK_CHECKED          0x0001          /* lock is being checked */
 415 #define USLOCK_TAKEN            0x0002          /* lock has been taken */
 416 #define USLOCK_INIT             0xBAA0          /* lock has been initialized */
 417 #define USLOCK_INITIALIZED      (USLOCK_INIT|USLOCK_CHECKED)    /* state usld_lock_init() stores when uslock_check is set */
 418 #define USLOCK_CHECKING(l)      (uslock_check &&                        \
 419                                  ((l)->debug.state & USLOCK_CHECKED))   /* true only if checking is globally on AND enabled for this lock */
 420
 421 /*
 422  *      Trace activities of a particularly interesting lock.
 423  */
 424 void    usl_trace(usimple_lock_t, int, pc_t, const char *);
 425
 426
 427 /*
 428  *      Initialize the debugging information contained
 429  *      in a usimple_lock.
 430  */
 431 void
 432 usld_lock_init(
 433         usimple_lock_t  l,
 434         __unused unsigned short tag)
 435 {
 436         if (l == USIMPLE_LOCK_NULL)
 437                 panic("lock initialization:  null lock pointer");
 438         l->lock_type = USLOCK_TAG;
 439         l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
 440         l->debug.lock_cpu = l->debug.unlock_cpu = 0;
 441         l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
 442         l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
 443         l->debug.duration[0] = l->debug.duration[1] = 0;
 444         l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
 445         l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
 446         l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
 447 }
 448
 449
 450 /*
 451  *      These checks apply to all usimple_locks, not just
 452  *      those with USLOCK_CHECKED turned on.
 453  */
 454 int
 455 usld_lock_common_checks(
 456         usimple_lock_t  l,
 457         char            *caller)
 458 {
 459         if (l == USIMPLE_LOCK_NULL)
 460                 panic("%s:  null lock pointer", caller);
 461         if (l->lock_type != USLOCK_TAG)
 462                 panic("%s:  0x%p is not a usimple lock", caller, l);
 463         if (!(l->debug.state & USLOCK_INIT))
 464                 panic("%s:  %p is not an initialized lock",
 465                       caller, l);
 466         return USLOCK_CHECKING(l);
 467 }
 468
 469
 470 /*
 471  *      Debug checks on a usimple_lock just before attempting
 472  *      to acquire it.
 473  */
 474 /* ARGSUSED */
 475 void
 476 usld_lock_pre(
 477         usimple_lock_t  l,
 478         pc_t            pc)
 479 {
 480         char    caller[] = "usimple_lock";
 481
 482
 483         if (!usld_lock_common_checks(l, caller))
 484                 return;
 485
 486 /*
 487  *      Note that we have a weird case where we are getting a lock when we are]
 488  *      in the process of putting the system to sleep. We are running with no
 489  *      current threads, therefore we can't tell if we are trying to retake a lock
 490  *      we have or someone on the other processor has it.  Therefore we just
 491  *      ignore this test if the locking thread is 0.
 492  */
 493
 494         if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
 495             l->debug.lock_thread == (void *) current_thread()) {
 496                 printf("%s:  lock %p already locked (at %p) by",
 497                       caller, l, l->debug.lock_pc);
 498                 printf(" current thread %p (new attempt at pc %p)\n",
 499                        l->debug.lock_thread, pc);
 500                 panic("%s", caller);
 501         }
 502         mp_disable_preemption();
 503         usl_trace(l, cpu_number(), pc, caller);
 504         mp_enable_preemption();
 505 }
 506
 507
 508 /*
 509  *      Debug checks on a usimple_lock just after acquiring it.
 510  *
 511  *      Pre-emption has been disabled at this point,
 512  *      so we are safe in using cpu_number.
 513  */
 514 void
 515 usld_lock_post(
 516         usimple_lock_t  l,
 517         pc_t            pc)
 518 {
 519         register int    mycpu;
 520         char    caller[] = "successful usimple_lock";
 521
 522
 523         if (!usld_lock_common_checks(l, caller))
 524                 return;
 525
 526         if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
 527                 panic("%s:  lock %p became uninitialized",
 528                       caller, l);
 529         if ((l->debug.state & USLOCK_TAKEN))
 530                 panic("%s:  lock 0x%p became TAKEN by someone else",
 531                       caller, l);
 532
 533         mycpu = cpu_number();
 534         l->debug.lock_thread = (void *)current_thread();
 535         l->debug.state |= USLOCK_TAKEN;
 536         l->debug.lock_pc = pc;
 537         l->debug.lock_cpu = mycpu;
 538
 539         usl_trace(l, mycpu, pc, caller);
 540 }
 541
 542
 543 /*
 544  *      Debug checks on a usimple_lock just before
 545  *      releasing it.  Note that the caller has not
 546  *      yet released the hardware lock.
 547  *
 548  *      Preemption is still disabled, so there's
 549  *      no problem using cpu_number.
 550  */
 551 void
 552 usld_unlock(
 553         usimple_lock_t  l,
 554         pc_t            pc)
 555 {
 556         register int    mycpu;
 557         char    caller[] = "usimple_unlock";
 558
 559
 560         if (!usld_lock_common_checks(l, caller))
 561                 return;
 562
 563         mycpu = cpu_number();
 564
 565         if (!(l->debug.state & USLOCK_TAKEN))
 566                 panic("%s:  lock 0x%p hasn't been taken",
 567                       caller, l);
 568         if (l->debug.lock_thread != (void *) current_thread())
 569                 panic("%s:  unlocking lock 0x%p, owned by thread %p",
 570                       caller, l, l->debug.lock_thread);
 571         if (l->debug.lock_cpu != mycpu) {
 572                 printf("%s:  unlocking lock 0x%p on cpu 0x%x",
 573                        caller, l, mycpu);
 574                 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
 575                 panic("%s", caller);
 576         }
 577         usl_trace(l, mycpu, pc, caller);
 578
 579         l->debug.unlock_thread = l->debug.lock_thread;
 580         l->debug.lock_thread = INVALID_PC;
 581         l->debug.state &= ~USLOCK_TAKEN;
 582         l->debug.unlock_pc = pc;
 583         l->debug.unlock_cpu = mycpu;
 584 }
 585
 586
 587 /*
 588  *      Debug checks on a usimple_lock just before
 589  *      attempting to acquire it.
 590  *
 591  *      Preemption isn't guaranteed to be disabled.
 592  */
 593 void
 594 usld_lock_try_pre(
 595         usimple_lock_t  l,
 596         pc_t            pc)
 597 {
 598         char    caller[] = "usimple_lock_try";
 599
 600         if (!usld_lock_common_checks(l, caller))
 601                 return;
 602         mp_disable_preemption();
 603         usl_trace(l, cpu_number(), pc, caller);
 604         mp_enable_preemption();
 605 }
 606
 607
 608 /*
 609  *      Debug checks on a usimple_lock just after
 610  *      successfully attempting to acquire it.
 611  *
 612  *      Preemption has been disabled by the
 613  *      lock acquisition attempt, so it's safe
 614  *      to use cpu_number.
 615  */
 616 void
 617 usld_lock_try_post(
 618         usimple_lock_t  l,
 619         pc_t            pc)
 620 {
 621         register int    mycpu;
 622         char    caller[] = "successful usimple_lock_try";
 623
 624         if (!usld_lock_common_checks(l, caller))
 625                 return;
 626
 627         if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
 628                 panic("%s:  lock 0x%p became uninitialized",
 629                       caller, l);
 630         if ((l->debug.state & USLOCK_TAKEN))
 631                 panic("%s:  lock 0x%p became TAKEN by someone else",
 632                       caller, l);
 633
 634         mycpu = cpu_number();
 635         l->debug.lock_thread = (void *) current_thread();
 636         l->debug.state |= USLOCK_TAKEN;
 637         l->debug.lock_pc = pc;
 638         l->debug.lock_cpu = mycpu;
 639
 640         usl_trace(l, mycpu, pc, caller);
 641 }
 642
 643
 644 /*
 645  *      For very special cases, set traced_lock to point to a
 646  *      specific lock of interest.  The result is a series of
 647  *      XPRs showing lock operations on that lock.  The lock_seq
 648  *      value is used to show the order of those operations.
 649  */
 650 usimple_lock_t          traced_lock;
 651 unsigned int            lock_seq;
 652
 653 void
 654 usl_trace(
 655         usimple_lock_t  l,
 656         int             mycpu,
 657         pc_t            pc,
 658         const char *    op_name)
 659 {
 660         if (traced_lock == l) {
 661                 XPR(XPR_SLOCK,
 662                     "seq %d, cpu %d, %s @ %x\n",
 663                     (uintptr_t) lock_seq, (uintptr_t) mycpu,
 664                     (uintptr_t) op_name, (uintptr_t) pc, 0);
 665                 lock_seq++;
 666         }
 667 }
 668
 669
 670 #endif  /* USLOCK_DEBUG */
 671
 672 /*
 673  *      Routine:        lock_alloc
 674  *      Function:
 675  *              Allocate a lock for external users who cannot
 676  *              hard-code the structure definition into their
 677  *              objects.
 678  *              For now just use kalloc, but a zone is probably
 679  *              warranted.
 680  */
 681 lock_t *
 682 lock_alloc(
 683         boolean_t       can_sleep,
 684         unsigned short  tag,
 685         unsigned short  tag1)
 686 {
 687         lock_t          *l;
 688
 689         if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0)
 690           lock_init(l, can_sleep, tag, tag1);
 691         return(l);
 692 }
 693
 694 /*
 695  *      Routine:        lock_free
 696  *      Function:
 697  *              Free a lock allocated for external users.
 698  *              For now just use kfree, but a zone is probably
 699  *              warranted.
 700  */
 701 void
 702 lock_free(
 703         lock_t          *l)
 704 {
 705         kfree(l, sizeof(lock_t));
 706 }
 707
 708
 709 /*
 710  *      Routine:        lock_init
 711  *      Function:
 712  *              Initialize a lock; required before use.
 713  *              Note that clients declare the "struct lock"
 714  *              variables and then initialize them, rather
 715  *              than getting a new one from this module.
 716  */
 717 void
 718 lock_init(
 719         lock_t          *l,
 720         boolean_t       can_sleep,
 721         __unused unsigned short tag,
 722         __unused unsigned short tag1)
 723 {
 724         hw_lock_byte_init(&l->lck_rw_interlock);
 725         l->lck_rw_want_write = FALSE;
 726         l->lck_rw_want_upgrade = FALSE;
 727         l->lck_rw_shared_count = 0;
 728         l->lck_rw_can_sleep = can_sleep;
 729         l->lck_rw_tag = tag;
 730         l->lck_rw_priv_excl = 1;
 731         l->lck_r_waiting = l->lck_w_waiting = 0;
 732 }
 733
 734
 735 /*
 736  *      Sleep locks.  These use the same data structure and algorithm
 737  *      as the spin locks, but the process sleeps while it is waiting
 738  *      for the lock.  These work on uniprocessor systems.
 739  */
 740
 741 #define DECREMENTER_TIMEOUT 1000000
 742
 743 void
 744 lock_write(
 745         register lock_t * l)
 746 {
 747         lck_rw_lock_exclusive(l);
 748 }
 749
 750 void
 751 lock_done(
 752         register lock_t * l)
 753 {
 754         (void) lck_rw_done(l);
 755 }
 756
 757 void
 758 lock_read(
 759         register lock_t * l)
 760 {
 761         lck_rw_lock_shared(l);
 762 }
 763
 764
 765 /*
 766  *      Routine:        lock_read_to_write
 767  *      Function:
 768  *              Improves a read-only lock to one with
 769  *              write permission.  If another reader has
 770  *              already requested an upgrade to a write lock,
 771  *              no lock is held upon return.
 772  *
 773  *              Returns FALSE if the upgrade *failed*.
 774  */
 775
 776 boolean_t
 777 lock_read_to_write(
 778         register lock_t * l)
 779 {
 780         return lck_rw_lock_shared_to_exclusive(l);
 781 }
 782
 783 void
 784 lock_write_to_read(
 785         register lock_t * l)
 786 {
 787         lck_rw_lock_exclusive_to_shared(l);
 788 }
 789
 790
 791
 792 /*
 793  *      Routine:        lck_rw_alloc_init
 794  */
 795 lck_rw_t *
 796 lck_rw_alloc_init(
 797         lck_grp_t       *grp,
 798         lck_attr_t      *attr) {
 799         lck_rw_t        *lck;
 800
 801         if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
 802                 bzero(lck, sizeof(lck_rw_t));
 803                 lck_rw_init(lck, grp, attr);
 804         }
 805
 806         return(lck);
 807 }
 808
 809 /*
 810  *      Routine:        lck_rw_free
 811  */
 812 void
 813 lck_rw_free(
 814         lck_rw_t        *lck,
 815         lck_grp_t       *grp) {
 816         lck_rw_destroy(lck, grp);
 817         kfree(lck, sizeof(lck_rw_t));
 818 }
 819
 820 /*
 821  *      Routine:        lck_rw_init
 822  */
 823 void
 824 lck_rw_init(
 825         lck_rw_t        *lck,
 826         lck_grp_t       *grp,
 827         lck_attr_t      *attr)
 828 {
 829         lck_attr_t      *lck_attr = (attr != LCK_ATTR_NULL) ?
 830                                         attr : &LockDefaultLckAttr;
 831
 832         hw_lock_byte_init(&lck->lck_rw_interlock);
 833         lck->lck_rw_want_write = FALSE;
 834         lck->lck_rw_want_upgrade = FALSE;
 835         lck->lck_rw_shared_count = 0;
 836         lck->lck_rw_can_sleep = TRUE;
 837         lck->lck_r_waiting = lck->lck_w_waiting = 0;
 838         lck->lck_rw_tag = 0;
 839         lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
 840                                 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
 841
 842         lck_grp_reference(grp);
 843         lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
 844 }
 845
 846 /*
 847  *      Routine:        lck_rw_destroy
 848  */
 849 void
 850 lck_rw_destroy(
 851         lck_rw_t        *lck,
 852         lck_grp_t       *grp)
 853 {
 854         if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
 855                 return;
 856         lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
 857         lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
 858         lck_grp_deallocate(grp);
 859         return;
 860 }
 861
 862 /*
 863  *      Sleep locks.  These use the same data structure and algorithm
 864  *      as the spin locks, but the process sleeps while it is waiting
 865  *      for the lock.  These work on uniprocessor systems.
 866  */
 867
 868 #define DECREMENTER_TIMEOUT 1000000
 869
 870 #define RW_LOCK_READER_EVENT(x)         \
 871                 ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))        /* event value: address of the lck_rw_tag field inside lock x */
 872
 873 #define RW_LOCK_WRITER_EVENT(x)         \
 874                 ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))       /* event value: address of the lck_rw_pad8 field inside lock x; passed to assert_wait() by the exclusive path */
 875
 876 /*
 877  * We need to disable interrupts while holding the mutex interlock
 878  * to prevent an IPI intervening.
 879  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 880  */
 881 static boolean_t
 882 lck_interlock_lock(lck_rw_t *lck)
 883 {
 884         boolean_t       istate;
 885
 886         istate = ml_set_interrupts_enabled(FALSE);
 887         hw_lock_byte_lock(&lck->lck_rw_interlock);
 888
 889         return istate;
 890 }
 891
 892 static void
 893 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
 894 {
 895         hw_lock_byte_unlock(&lck->lck_rw_interlock);
 896         ml_set_interrupts_enabled(istate);
 897 }
 898
 899 /*
 900  * This inline is used when busy-waiting for an rw lock.
 901  * If interrupts were disabled when the lock primitive was called,
 902  * we poll the IPI handler for pending tlb flushes.
 903  * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
 904  */
 905 static inline void
 906 lck_rw_lock_pause(boolean_t interrupts_enabled)
 907 {
 908         if (!interrupts_enabled)
 909                 handle_pending_TLB_flushes();
 910         cpu_pause();
 911 }
 912
 913
 914 /*
 915  * compute the deadline to spin against when
 916  * waiting for a change of state on a lck_rw_t
 917  */
 918 static inline uint64_t
 919 lck_rw_deadline_for_spin(lck_rw_t *lck)
 920 {
 921         if (lck->lck_rw_can_sleep) {
 922                 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
 923                         /*
 924                          * there are already threads waiting on this lock... this
 925                          * implies that they have spun beyond their deadlines waiting for
 926                          * the desired state to show up so we will not bother spinning at this time...
 927                          *   or
 928                          * the current number of threads sharing this lock exceeds our capacity to run them
 929                          * concurrently and since all states we're going to spin for require the rw_shared_count
 930                          * to be at 0, we'll not bother spinning since the latency for this to happen is
 931                          * unpredictable...
 932                          */
 933                         return (mach_absolute_time());
 934                 }
 935                 return (mach_absolute_time() + MutexSpin);
 936         } else
 937                 return (mach_absolute_time() + (100000LL * 1000000000LL));
 938 }
 939
 940
 941 /*
 942  *      Routine:        lck_rw_lock_exclusive_gen
 943  */
 944 void
 945 lck_rw_lock_exclusive_gen(
 946         lck_rw_t        *lck)
 947 {
 948         uint64_t        deadline = 0;
 949         int             slept = 0;
 950         int             gotlock = 0;
 951         int             lockheld = 0;
 952         wait_result_t   res = 0;
 953         boolean_t       istate = -1;
 954
 955 #if     CONFIG_DTRACE
 956         boolean_t dtrace_ls_initialized = FALSE;
 957         boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
 958         uint64_t wait_interval = 0;
 959         int readers_at_sleep = 0;
 960 #endif
 961
 962         /*
 963          *      Try to acquire the lck_rw_want_write bit.
 964          */
 965         while ( !lck_rw_grab_want(lck)) {
 966
 967 #if     CONFIG_DTRACE
 968                 if (dtrace_ls_initialized == FALSE) {
 969                         dtrace_ls_initialized = TRUE;
 970                         dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
 971                         dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
 972                         dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
 973                         if (dtrace_ls_enabled) {
 974                                 /*
 975                                  * Either sleeping or spinning is happening,
 976                                  *  start a timing of our delay interval now.
 977                                  */
 978                                 readers_at_sleep = lck->lck_rw_shared_count;
 979                                 wait_interval = mach_absolute_time();
 980                         }
 981                 }
 982 #endif
 983                 if (istate == -1)
 984                         istate = ml_get_interrupts_enabled();
 985
 986                 deadline = lck_rw_deadline_for_spin(lck);
 987
 988                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
 989
 990                 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
 991                         lck_rw_lock_pause(istate);
 992
 993                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);
 994
 995                 if (gotlock)
 996                         break;
 997                 /*
 998                  * if we get here, the deadline has expired w/o us
 999                  * being able to grab the lock exclusively
1000                  * check to see if we're allowed to do a thread_block
1001                  */
1002                 if (lck->lck_rw_can_sleep) {
1003
1004                         istate = lck_interlock_lock(lck);
1005
1006                         if (lck->lck_rw_want_write) {
1007
1008                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1009
1010                                 lck->lck_w_waiting = TRUE;
1011
1012                                 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1013                                 lck_interlock_unlock(lck, istate);
1014
1015                                 if (res == THREAD_WAITING) {
1016                                         res = thread_block(THREAD_CONTINUE_NULL);
1017                                         slept++;
1018                                 }
1019                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
1020                         } else {
1021                                 lck->lck_rw_want_write = TRUE;
1022                                 lck_interlock_unlock(lck, istate);
1023                                 break;
1024                         }
1025                 }
1026         }
1027         /*
1028          * Wait for readers (and upgrades) to finish...
1029          * the test for these conditions must be done simultaneously with
1030          * a check of the interlock not being held since
1031          * the rw_shared_count will drop to 0 first and then want_upgrade
1032          * will be set to 1 in the shared_to_exclusive scenario... those
1033          * adjustments are done behind the interlock and represent an
1034          * atomic change in state and must be considered as such
1035          * however, once we see the read count at 0, the want_upgrade not set
1036          * and the interlock not held, we are safe to proceed
1037          */
1038         while (lck_rw_held_read_or_upgrade(lck)) {
1039
1040 #if     CONFIG_DTRACE
1041                 /*
1042                  * Either sleeping or spinning is happening, start
1043                  * a timing of our delay interval now.  If we set it
1044                  * to -1 we don't have accurate data so we cannot later
1045                  * decide to record a dtrace spin or sleep event.
1046                  */
1047                 if (dtrace_ls_initialized == FALSE) {
1048                         dtrace_ls_initialized = TRUE;
1049                         dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1050                         dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1051                         dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1052                         if (dtrace_ls_enabled) {
1053                                 /*
1054                                  * Either sleeping or spinning is happening,
1055                                  *  start a timing of our delay interval now.
1056                                  */
1057                                 readers_at_sleep = lck->lck_rw_shared_count;
1058                                 wait_interval = mach_absolute_time();
1059                         }
1060                 }
1061 #endif
1062                 if (istate == -1)
1063                         istate = ml_get_interrupts_enabled();
1064
1065                 deadline = lck_rw_deadline_for_spin(lck);
1066
1067                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1068
1069                 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1070                         lck_rw_lock_pause(istate);
1071
1072                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);
1073
1074                 if ( !lockheld)
1075                         break;
1076                 /*
1077                  * if we get here, the deadline has expired w/o us
1078                  * being able to grab the lock exclusively
1079                  * check to see if we're allowed to do a thread_block
1080                  */
1081                 if (lck->lck_rw_can_sleep) {
1082
1083                         istate = lck_interlock_lock(lck);
1084
1085                         if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1086                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1087
1088                                 lck->lck_w_waiting = TRUE;
1089
1090                                 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1091                                 lck_interlock_unlock(lck, istate);
1092
1093                                 if (res == THREAD_WAITING) {
1094                                         res = thread_block(THREAD_CONTINUE_NULL);
1095                                         slept++;
1096                                 }
1097                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
1098                         } else {
1099                                 lck_interlock_unlock(lck, istate);
1100                                 /*
1101                                  * must own the lock now, since we checked for
1102                                  * readers or upgrade owner behind the interlock
1103                                  * no need for a call to 'lck_rw_held_read_or_upgrade'
1104                                  */
1105                                 break;
1106                         }
1107                 }
1108         }
1109
1110 #if     CONFIG_DTRACE
1111         /*
1112          * Decide what latencies we suffered that are Dtrace events.
1113          * If we have set wait_interval, then we either spun or slept.
1114          * At least we get out from under the interlock before we record
1115          * which is the best we can do here to minimize the impact
1116          * of the tracing.
1117          * If we have set wait_interval to -1, then dtrace was not enabled when we
1118          * started sleeping/spinning so we don't record this event.
1119          */
1120         if (dtrace_ls_enabled == TRUE) {
1121                 if (slept == 0) {
1122                         LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1123                             mach_absolute_time() - wait_interval, 1);
1124                 } else {
1125                         /*
1126                          * For the blocking case, we also record if when we blocked
1127                          * it was held for read or write, and how many readers.
1128                          * Notice that above we recorded this before we dropped
1129                          * the interlock so the count is accurate.
1130                          */
1131                         LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1132                             mach_absolute_time() - wait_interval, 1,
1133                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1134                 }
1135         }
1136         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1137 #endif
1138 }
1139
1140
1141 /*
1142  *      Routine:        lck_rw_done_gen
1143  *
1144  *      called from the assembly language wrapper...
1145  *      prior_lock_state is the value in the 1st
1146  *      word of the lock at the time of a successful
1147  *      atomic compare and exchange with the new value...
1148  *      it represents the state of the lock before we
1149  *      decremented the rw_shared_count or cleared either
1150  *      rw_want_upgrade or rw_want_write and
1151  *      the lck_x_waiting bits...  since the wrapper
1152  *      routine has already changed the state atomically,
1153  *      we just need to decide if we should
1154  *      wake up anyone and what value to return... we do
1155  *      this by examining the state of the lock before
1156  *      we changed it
      *
      *      Arguments:
      *              lck                     the lock that was released... used here
      *                                      only to form the wakeup events and for
      *                                      the lockstat record
      *              prior_lock_state        snapshot of the 1st word of the lock
      *                                      taken before the release
      *      Returns:
      *              LCK_RW_TYPE_SHARED if the snapshot shows a non-zero
      *              lck_rw_shared_count, LCK_RW_TYPE_EXCLUSIVE otherwise
1157  */
1158 lck_rw_type_t
1159 lck_rw_done_gen(
1160         lck_rw_t        *lck,
1161         int             prior_lock_state)
1162 {
1163         lck_rw_t        *fake_lck;
1164         lck_rw_type_t   lock_type;
1165
1166         /*
1167          * prior_lock state is a snapshot of the 1st word of the
1168          * lock in question... we'll fake up a pointer to it
1169          * and carefully not access anything beyond what's defined
1170          * in the first word of a lck_rw_t
1171          */
1172         fake_lck = (lck_rw_t *)&prior_lock_state;
1173
             /*
              * a prior count of 0 (held exclusive) or 1 (we were the last
              * reader) means nobody holds the lock any more, so waiters
              * recorded in the snapshot need a wakeup
              */
1174         if (fake_lck->lck_rw_shared_count <= 1) {
1175                 if (fake_lck->lck_w_waiting)
1176                         thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1177
                     /*
                      * readers are woken too, unless the lock is marked
                      * rw_priv_excl and a writer was waiting
                      */
1178                 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1179                         thread_wakeup(RW_LOCK_READER_EVENT(lck));
1180         }
1181         if (fake_lck->lck_rw_shared_count)
1182                 lock_type = LCK_RW_TYPE_SHARED;
1183         else
1184                 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1185
1186 #if CONFIG_DTRACE
1187         LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1188 #endif
1189
1190         return(lock_type);
1191 }
1192
1193
1194 /*
1195  *      Routine:        lck_rw_unlock
      *      Function:
      *              release a rw lock held in the mode named by lck_rw_type...
      *              dispatches to lck_rw_unlock_shared or
      *              lck_rw_unlock_exclusive and panics on any other
      *              lck_rw_type value
      *      Arguments:
      *              lck             the lock to release
      *              lck_rw_type     LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1196  */
1197 void
1198 lck_rw_unlock(
1199         lck_rw_t        *lck,
1200         lck_rw_type_t   lck_rw_type)
1201 {
1202         if (lck_rw_type == LCK_RW_TYPE_SHARED)
1203                 lck_rw_unlock_shared(lck);
1204         else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1205                 lck_rw_unlock_exclusive(lck);
1206         else
1207                 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1208 }
1209
1210
1211 /*
1212  *      Routine:        lck_rw_unlock_shared
      *      Function:
      *              release a rw lock the caller holds shared...
      *              the release itself is done by lck_rw_done, whose
      *              return value is checked here: anything other than
      *              LCK_RW_TYPE_SHARED is a caller error and panics
      *      Arguments:
      *              lck     the lock to release
1213  */
1214 void
1215 lck_rw_unlock_shared(
1216         lck_rw_t        *lck)
1217 {
1218         lck_rw_type_t   ret;
1219
1220         ret = lck_rw_done(lck);
1221
             /* name this routine (not lck_rw_unlock) so the panic log points at the real caller path */
1222         if (ret != LCK_RW_TYPE_SHARED)
1223                 panic("lck_rw_unlock_shared(): lock held in mode: %d\n", ret);
1224 }
1225
1226
1227 /*
1228  *      Routine:        lck_rw_unlock_exclusive
      *      Function:
      *              release a rw lock the caller holds exclusive...
      *              the release itself is done by lck_rw_done, whose
      *              return value is checked here: anything other than
      *              LCK_RW_TYPE_EXCLUSIVE is a caller error and panics
      *      Arguments:
      *              lck     the lock to release
1229  */
1230 void
1231 lck_rw_unlock_exclusive(
1232         lck_rw_t        *lck)
1233 {
1234         lck_rw_type_t   ret;
1235
1236         ret = lck_rw_done(lck);
1237
1238         if (ret != LCK_RW_TYPE_EXCLUSIVE)
1239                 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1240 }
1241
1242
1243 /*
1244  *      Routine:        lck_rw_lock
      *      Function:
      *              acquire a rw lock in the mode named by lck_rw_type...
      *              dispatches to lck_rw_lock_shared or
      *              lck_rw_lock_exclusive and panics on any other
      *              lck_rw_type value
      *      Arguments:
      *              lck             the lock to acquire
      *              lck_rw_type     LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1245  */
1246 void
1247 lck_rw_lock(
1248         lck_rw_t        *lck,
1249         lck_rw_type_t   lck_rw_type)
1250 {
1251         if (lck_rw_type == LCK_RW_TYPE_SHARED)
1252                 lck_rw_lock_shared(lck);
1253         else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1254                 lck_rw_lock_exclusive(lck);
1255         else
1256                 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1257 }
1258
1259
1260 /*
1261  *      Routine:        lck_rw_lock_shared_gen
1262  *      Function:
1263  *              assembly fast path code has determined that this lock
1264  *              is held exclusively... this is where we spin/block
1265  *              until we can acquire the lock in the shared mode
      *
      *              we spin on lck_rw_grab_shared until the deadline from
      *              lck_rw_deadline_for_spin expires... after that, if
      *              lck_rw_can_sleep is set, we either block on
      *              RW_LOCK_READER_EVENT or take the read count directly
      *              behind the interlock
      *      Arguments:
      *              lck     the rw lock to acquire shared
1266  */
1267 void
1268 lck_rw_lock_shared_gen(
1269         lck_rw_t        *lck)
1270 {
1271         uint64_t        deadline = 0;
1272         int             gotlock = 0;
1273         int             slept = 0;
1274         wait_result_t   res = 0;
1275         boolean_t       istate = -1;    /* -1 == interrupt state not sampled yet */
1276
1277 #if     CONFIG_DTRACE
1278         uint64_t wait_interval = 0;
1279         int readers_at_sleep = 0;
1280         boolean_t dtrace_ls_initialized = FALSE;
1281         boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1282 #endif
1283
1284         while ( !lck_rw_grab_shared(lck)) {
1285
1286 #if     CONFIG_DTRACE
1287                 if (dtrace_ls_initialized == FALSE) {
1288                         dtrace_ls_initialized = TRUE;
1289                         dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1290                         dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1291                         dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1292                         if (dtrace_ls_enabled) {
1293                                 /*
1294                                  * Either sleeping or spinning is happening,
1295                                  *  start a timing of our delay interval now.
1296                                  */
1297                                 readers_at_sleep = lck->lck_rw_shared_count;
1298                                 wait_interval = mach_absolute_time();
1299                         }
1300                 }
1301 #endif
1302                 if (istate == -1)
1303                         istate = ml_get_interrupts_enabled();
1304
1305                 deadline = lck_rw_deadline_for_spin(lck);
1306
1307                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1308                              (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1309
                     /* spin: keep trying for a read count until we get one or the deadline passes */
1310                 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1311                         lck_rw_lock_pause(istate);
1312
1313                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1314                              (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1315
1316                 if (gotlock)
1317                         break;
1318                 /*
1319                  * if we get here, the deadline has expired w/o us
1320                  * being able to grab the lock for read
1321                  * check to see if we're allowed to do a thread_block
1322                  */
1323                 if (lck->lck_rw_can_sleep) {
1324
1325                         istate = lck_interlock_lock(lck);
1326
                             /*
                              * we have to wait if a writer or upgrader has claimed
                              * the lock and either no read counts are outstanding
                              * or the lock is marked rw_priv_excl
                              */
1327                         if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1328                             ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1329
1330                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1331                                              (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1332
                                     /* lck_rw_done_gen issues the reader wakeup only when it sees this bit */
1333                                 lck->lck_r_waiting = TRUE;
1334
1335                                 res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1336                                 lck_interlock_unlock(lck, istate);
1337
1338                                 if (res == THREAD_WAITING) {
1339                                         res = thread_block(THREAD_CONTINUE_NULL);
1340                                         slept++;
1341                                 }
1342                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1343                                              (int)lck, res, slept, 0, 0);
1344                         } else {
                                     /*
                                      * nothing blocks a reader any more... take our
                                      * read count while still behind the interlock
                                      */
1345                                 lck->lck_rw_shared_count++;
1346                                 lck_interlock_unlock(lck, istate);
1347                                 break;
1348                         }
1349                 }
1350         }
1351
1352 #if     CONFIG_DTRACE
             /*
              * dtrace_ls_enabled is only set once we have had to spin or
              * sleep with one of the lockstat probes enabled... record the
              * delay as a block if we slept at least once, else as a spin
              */
1353         if (dtrace_ls_enabled == TRUE) {
1354                 if (slept == 0) {
1355                         LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1356                 } else {
1357                         LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1358                             mach_absolute_time() - wait_interval, 0,
1359                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1360                 }
1361         }
1362         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1363 #endif
1364 }
1365
1366
1367 /*
1368  *      Routine:        lck_rw_lock_shared_to_exclusive_failure
1369  *      Function:
1370  *              assembly fast path code has already dropped our read
1371  *              count and determined that someone else owns 'lck_rw_want_upgrade'
1372  *              if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1373  *              all we need to do here is determine if a wakeup is needed
      *      Arguments:
      *              lck                     the lock whose upgrade attempt failed
      *              prior_lock_state        snapshot of the 1st word of the lock
      *                                      from before our read count was dropped
      *      Returns:
      *              FALSE, always
1374  */
1375 boolean_t
1376 lck_rw_lock_shared_to_exclusive_failure(
1377         lck_rw_t        *lck,
1378         int             prior_lock_state)
1379 {
1380         lck_rw_t        *fake_lck;
1381
1382         /*
1383          * prior_lock state is a snapshot of the 1st word of the
1384          * lock in question... we'll fake up a pointer to it
1385          * and carefully not access anything beyond what's defined
1386          * in the first word of a lck_rw_t
1387          */
1388         fake_lck = (lck_rw_t *)&prior_lock_state;
1389
             /* a prior count of 1 means ours was the last read count outstanding */
1390         if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1391                 /*
1392                  *      Someone else has requested upgrade.
1393                  *      Since we've released the read lock, wake
1394                  *      him up if he's blocked waiting
1395                  */
1396                 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1397         }
             /* note that this trace reads the live lock, not the snapshot */
1398         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1399                      (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1400
1401         return (FALSE);
1402 }
1403
1404
1405 /*
1406  *      Routine:        lck_rw_lock_shared_to_exclusive_success
1407  *      Function:
1408  *              assembly fast path code has already dropped our read
1409  *              count and successfully acquired 'lck_rw_want_upgrade'
1410  *              we just need to wait for the rest of the readers to drain
1411  *              and then we can return as the exclusive holder of this lock
      *
      *              we spin on lck_rw_shared_count until the deadline from
      *              lck_rw_deadline_for_spin expires... after that, if
      *              lck_rw_can_sleep is set, we block on RW_LOCK_WRITER_EVENT
      *              for as long as the count is still non-zero behind
      *              the interlock
      *      Arguments:
      *              lck     the rw lock being upgraded
      *      Returns:
      *              TRUE, always
1412  */
1413 boolean_t
1414 lck_rw_lock_shared_to_exclusive_success(
1415         lck_rw_t        *lck)
1416 {
1417         uint64_t        deadline = 0;
1418         int             slept = 0;
1419         int             still_shared = 0;
1420         wait_result_t   res;
1421         boolean_t       istate = -1;    /* -1 == interrupt state not sampled yet */
1422
1423 #if     CONFIG_DTRACE
1424         uint64_t wait_interval = 0;
1425         int readers_at_sleep = 0;
1426         boolean_t dtrace_ls_initialized = FALSE;
1427         boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1428 #endif
1429
1430         while (lck->lck_rw_shared_count != 0) {
1431
1432 #if     CONFIG_DTRACE
1433                 if (dtrace_ls_initialized == FALSE) {
1434                         dtrace_ls_initialized = TRUE;
1435                         dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1436                         dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1437                         dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1438                         if (dtrace_ls_enabled) {
1439                                 /*
1440                                  * Either sleeping or spinning is happening,
1441                                  *  start a timing of our delay interval now.
1442                                  */
1443                                 readers_at_sleep = lck->lck_rw_shared_count;
1444                                 wait_interval = mach_absolute_time();
1445                         }
1446                 }
1447 #endif
1448                 if (istate == -1)
1449                         istate = ml_get_interrupts_enabled();
1450
1451                 deadline = lck_rw_deadline_for_spin(lck);
1452
1453                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1454                              (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1455
                     /* spin: wait for the read count to reach 0 or for the deadline to pass */
1456                 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1457                         lck_rw_lock_pause(istate);
1458
1459                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1460                              (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1461
1462                 if ( !still_shared)
1463                         break;
1464                 /*
1465                  * if we get here, the deadline has expired w/o
1466                  * the rw_shared_count having drained to 0
1467                  * check to see if we're allowed to do a thread_block
1468                  */
1469                 if (lck->lck_rw_can_sleep) {
1470
1471                         istate = lck_interlock_lock(lck);
1472
                             /* re-check behind the interlock before committing to sleep */
1473                         if (lck->lck_rw_shared_count != 0) {
1474                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1475                                              (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1476
1477                                 lck->lck_w_waiting = TRUE;
1478
1479                                 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1480                                 lck_interlock_unlock(lck, istate);
1481
1482                                 if (res == THREAD_WAITING) {
1483                                         res = thread_block(THREAD_CONTINUE_NULL);
1484                                         slept++;
1485                                 }
1486                                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1487                                              (int)lck, res, slept, 0, 0);
1488                         } else {
1489                                 lck_interlock_unlock(lck, istate);
1490                                 break;
1491                         }
1492                 }
1493         }
1494 #if     CONFIG_DTRACE
1495         /*
1496          * dtrace_ls_enabled is only set if we took the sleep/spin path above with a lockstat probe enabled.
              * NOTE(review): the spin record below passes 0 as its last argument
              * while the block record passes 1 in the same position...
              * lck_rw_lock_exclusive_gen passes 1 in both... confirm which is intended
1497          */
1498         if (dtrace_ls_enabled == TRUE) {
1499                 if (slept == 0) {
1500                         LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1501                 } else {
1502                         LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1503                             mach_absolute_time() - wait_interval, 1,
1504                             (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1505                 }
1506         }
1507         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1508 #endif
1509         return (TRUE);
1510 }
1511
1512
1513 /*
1514  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1515  *      Function:
1516  *              assembly fast path has already dropped
1517  *              our exclusive state and bumped lck_rw_shared_count
1518  *              all we need to do here is determine if anyone
1519  *              needs to be awakened.
      *      Arguments:
      *              lck                     the lock that was downgraded
      *              prior_lock_state        snapshot of the 1st word of the lock...
      *                                      the wakeup decision is made from this
      *                                      snapshot, not from the live lock
1520  */
1521 void
1522 lck_rw_lock_exclusive_to_shared_gen(
1523         lck_rw_t        *lck,
1524         int             prior_lock_state)
1525 {
1526         lck_rw_t        *fake_lck;
1527
1528         /*
1529          * prior_lock state is a snapshot of the 1st word of the
1530          * lock in question... we'll fake up a pointer to it
1531          * and carefully not access anything beyond what's defined
1532          * in the first word of a lck_rw_t
1533          */
1534         fake_lck = (lck_rw_t *)&prior_lock_state;
1535
1536         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1537                              (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1538
1539         /*
1540          * don't wake up anyone waiting to take the lock exclusively
1541          * since we hold a read count... when the read count drops to 0,
1542          * the writers will be woken.
1543          *
1544          * wake up any waiting readers if we don't have any writers waiting,
1545          * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1546          */
1547         if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1548                 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1549
             /* the END trace reports the live lock state rather than the snapshot */
1550         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1551                              (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1552
1553 #if CONFIG_DTRACE
1554         LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1555 #endif
1556 }
1557
1558
1559 /*
1560  *      Routine:        lck_rw_try_lock
      *      Function:
      *              attempt to acquire a rw lock in the mode named by
      *              lck_rw_type... dispatches to lck_rw_try_lock_shared or
      *              lck_rw_try_lock_exclusive and panics on any other
      *              lck_rw_type value
      *      Arguments:
      *              lck             the lock to try
      *              lck_rw_type     LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
      *      Returns:
      *              whatever the mode specific try routine returns
1561  */
1562 boolean_t
1563 lck_rw_try_lock(
1564         lck_rw_t        *lck,
1565         lck_rw_type_t   lck_rw_type)
1566 {
1567         if (lck_rw_type == LCK_RW_TYPE_SHARED)
1568                 return(lck_rw_try_lock_shared(lck));
1569         else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1570                 return(lck_rw_try_lock_exclusive(lck));
1571         else
1572                 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
             /* only reached after the panic above, presumably to give the non-void function a return on every path */
1573         return(FALSE);
1574 }
1575
1576
1577 void
1578 lck_rw_assert(
1579         lck_rw_t        *lck,
1580         unsigned int    type)
1581 {
             /*
              * Panic unless the lock is in the state named by 'type':
              *   LCK_RW_ASSERT_SHARED     lck_rw_shared_count is non-zero
              *   LCK_RW_ASSERT_EXCLUSIVE  want_write or want_upgrade is set
              *                            and lck_rw_shared_count is 0
              *   LCK_RW_ASSERT_HELD       any of the three is set/non-zero
              * an unrecognized 'type' falls through to the panic as well...
              * only the lock word is examined here, there is no check
              * that the calling thread is the holder
              */
1582         switch (type) {
1583         case LCK_RW_ASSERT_SHARED:
1584                 if (lck->lck_rw_shared_count != 0) {
1585                         return;
1586                 }
1587                 break;
1588         case LCK_RW_ASSERT_EXCLUSIVE:
1589                 if ((lck->lck_rw_want_write ||
1590                      lck->lck_rw_want_upgrade) &&
1591                     lck->lck_rw_shared_count == 0) {
1592                         return;
1593                 }
1594                 break;
1595         case LCK_RW_ASSERT_HELD:
1596                 if (lck->lck_rw_want_write ||
1597                     lck->lck_rw_want_upgrade ||
1598                     lck->lck_rw_shared_count != 0) {
1599                         return;
1600                 }
1601                 break;
1602         default:
1603                 break;
1604         }
1605
             /* the first 32 bits of the lock are dumped to help diagnose the failure */
1606         panic("rw lock (%p) not held (mode=%u), first word %08x\n", lck, type, *(uint32_t *)lck);
1607 }
1608
1609 /*
1610  *      Routine:        lck_mtx_alloc_init
      *      Function:
      *              kalloc a lck_mtx_t and initialize it with lck_mtx_init
      *      Arguments:
      *              grp     lock group, handed to lck_mtx_init
      *              attr    lock attributes, handed to lck_mtx_init
      *      Returns:
      *              the new mutex, or 0 if kalloc failed (in which case
      *              lck_mtx_init is not called)
1611  */
1612 lck_mtx_t *
1613 lck_mtx_alloc_init(
1614         lck_grp_t       *grp,
1615         lck_attr_t      *attr)
1616 {
1617         lck_mtx_t       *lck;
1618
1619         if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1620                 lck_mtx_init(lck, grp, attr);
1621
1622         return(lck);
1623 }
1624
1625 /*
1626  *      Routine:        lck_mtx_free
      *      Function:
      *              destroy a mutex with lck_mtx_destroy and then kfree
      *              its sizeof(lck_mtx_t) bytes of storage
      *      Arguments:
      *              lck     the mutex to tear down and free
      *              grp     lock group, handed to lck_mtx_destroy
1627  */
1628 void
1629 lck_mtx_free(
1630         lck_mtx_t       *lck,
1631         lck_grp_t       *grp)
1632 {
1633         lck_mtx_destroy(lck, grp);
1634         kfree(lck, sizeof(lck_mtx_t));
1635 }
1636
1637 /*
1638  *      Routine:        lck_mtx_ext_init
      *      Function:
      *              zero an extended mutex structure and fill it in:
      *              debug type/attribute if attr has LCK_ATTR_DEBUG,
      *              the owning group, the STAT attribute if the group has
      *              LCK_GRP_ATTR_STAT, and the LCK_MTX_PTR_EXTENDED marker
      *              in the embedded mutex's lck_mtx_ptr
      *      Arguments:
      *              lck     the extended mutex to initialize
      *              grp     lock group recorded in lck_mtx_grp
      *              attr    lock attributes... dereferenced without a NULL
      *                      check, the callers in this file pass a non-NULL
      *                      pointer
1639  */
1640 static void
1641 lck_mtx_ext_init(
1642         lck_mtx_ext_t   *lck,
1643         lck_grp_t       *grp,
1644         lck_attr_t      *attr)
1645 {
1646         bzero((void *)lck, sizeof(lck_mtx_ext_t));
1647
1648         if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1649                 lck->lck_mtx_deb.type = MUTEX_TAG;
1650                 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1651         }
1652
1653         lck->lck_mtx_grp = grp;
1654
1655         if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1656                  lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1657
1658         lck->lck_mtx.lck_mtx_ptr = (void *)LCK_MTX_PTR_EXTENDED;
1659 }
1660
1661 /*
1662  *      Routine:        lck_mtx_init
      *      Function:
      *              initialize a mutex in caller supplied storage...
      *              with LCK_ATTR_DEBUG set in the attributes, an extended
      *              mutex is kalloc'ed and 'lck' is made an indirect
      *              reference to it... otherwise the fields of 'lck' are
      *              cleared in place
      *              in both cases a reference on 'grp' is taken and its
      *              mutex count is bumped
      *      Arguments:
      *              lck     the mutex to initialize
      *              grp     lock group the mutex is charged to
      *              attr    lock attributes... LCK_ATTR_NULL selects
      *                      LockDefaultLckAttr
1663  */
1664 void
1665 lck_mtx_init(
1666         lck_mtx_t       *lck,
1667         lck_grp_t       *grp,
1668         lck_attr_t      *attr)
1669 {
1670         lck_mtx_ext_t   *lck_ext;
1671         lck_attr_t      *lck_attr;
1672
1673         if (attr != LCK_ATTR_NULL)
1674                 lck_attr = attr;
1675         else
1676                 lck_attr = &LockDefaultLckAttr;
1677
1678         if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
                     /*
                      * NOTE(review): if kalloc returns 0 here, none of the
                      * fields of 'lck' are written, yet the group reference
                      * and count below are still taken... confirm that this
                      * is acceptable to callers
                      */
1679                 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1680                         lck_mtx_ext_init(lck_ext, grp, lck_attr);
1681                         lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1682                         lck->lck_mtx_ptr = lck_ext;
1683                         lck->lck_mtx_ilocked = 1;
1684                 }
1685         } else {
1686                 lck->lck_mtx_owner = 0;
1687                 lck->lck_mtx_ptr = 0;
1688                 lck->lck_mtx_waiters = 0;
1689                 lck->lck_mtx_pri = 0;
1690                 lck->lck_mtx_ilocked = 0;
1691                 lck->lck_mtx_mlocked = 0;
1692                 lck->lck_mtx_promoted = 0;
1693                 lck->lck_mtx_spin = 0;
1694         }
1695         lck_grp_reference(grp);
1696         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1697 }
1698
1699 /*
1700  *      Routine:        lck_mtx_init_ext
      *      Function:
      *              same as lck_mtx_init, except that the storage for the
      *              extended mutex is supplied by the caller instead of
      *              being kalloc'ed... 'lck_ext' is only touched when
      *              LCK_ATTR_DEBUG is set in the attributes
      *      Arguments:
      *              lck     the mutex to initialize
      *              lck_ext caller supplied extended mutex storage
      *              grp     lock group the mutex is charged to
      *              attr    lock attributes... LCK_ATTR_NULL selects
      *                      LockDefaultLckAttr
1701  */
1702 void
1703 lck_mtx_init_ext(
1704         lck_mtx_t       *lck,
1705         lck_mtx_ext_t   *lck_ext,
1706         lck_grp_t       *grp,
1707         lck_attr_t      *attr)
1708 {
1709         lck_attr_t      *lck_attr;
1710
1711         if (attr != LCK_ATTR_NULL)
1712                 lck_attr = attr;
1713         else
1714                 lck_attr = &LockDefaultLckAttr;
1715
1716         if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1717                 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1718                 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1719                 lck->lck_mtx_ptr = lck_ext;
1720                 lck->lck_mtx_ilocked = 1;
1721         } else {
1722                 lck->lck_mtx_owner = 0;
1723                 lck->lck_mtx_ptr = 0;
1724                 lck->lck_mtx_waiters = 0;
1725                 lck->lck_mtx_pri = 0;
1726                 lck->lck_mtx_ilocked = 0;
1727                 lck->lck_mtx_mlocked = 0;
1728                 lck->lck_mtx_promoted = 0;
1729                 lck->lck_mtx_spin = 0;
1730         }
1731         lck_grp_reference(grp);
1732         lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1733 }
1734
1735 /*
1736  *      Routine:        lck_mtx_destroy
1737  */
1738 void
1739 lck_mtx_destroy(
1740         lck_mtx_t       *lck,
1741         lck_grp_t       *grp)
1742 {
1743         boolean_t lck_is_indirect;
1744
1745         if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1746                 return;
1747         lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1748
1749         lck_mtx_lock_mark_destroyed(lck);
1750
1751         if (lck_is_indirect)
1752                 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1753         lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1754         lck_grp_deallocate(grp);
1755         return;
1756 }
1757
1758
/*
 * Trace subcodes for the mutex contention routines below.  Each one is
 * combined with DBG_MACH_LOCKS through MACHDBG_CODE() in the
 * KERNEL_DEBUG() calls of the wait, wakeup, spin and acquire routines.
 */
#define LCK_MTX_LCK_WAIT_CODE           0x20
#define LCK_MTX_LCK_WAKEUP_CODE         0x21
#define LCK_MTX_LCK_SPIN_CODE           0x22
#define LCK_MTX_LCK_ACQUIRE_CODE        0x23
#define LCK_MTX_LCK_DEMOTE_CODE         0x24
1764
1765
1766 /*
1767  * Routine:     lck_mtx_unlock_wakeup_x86
1768  *
1769  * Invoked on unlock when there is contention.
1770  *
1771  */
1772 void
1773 lck_mtx_unlock_wakeup_x86 (
1774         lck_mtx_t       *mutex,
1775         int             owner_was_promoted)
1776 {
1777
1778         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, (int)mutex, owner_was_promoted, mutex->lck_mtx_waiters, 0, 0);
1779
1780         if (lck_mtx_lock_decr_waiter(mutex))
1781                 thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
1782
1783         if (owner_was_promoted) {
1784                 thread_t        thread = current_thread();
1785
1786
1787                 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), thread->promotions,
1788                              thread->sched_mode & TH_MODE_PROMOTED, 0, 0);
1789
1790                 if (thread->promotions > 0) {
1791                         spl_t   s = splsched();
1792
1793                         thread_lock(thread);
1794
1795                         if (--thread->promotions == 0 && (thread->sched_mode & TH_MODE_PROMOTED)) {
1796
1797                                 thread->sched_mode &= ~TH_MODE_PROMOTED;
1798
1799                                 if (thread->sched_mode & TH_MODE_ISDEPRESSED) {
1800                                         KERNEL_DEBUG_CONSTANT(
1801                                                 MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
1802                                                 thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
1803
1804                                         set_sched_pri(thread, DEPRESSPRI);
1805                                 }
1806                                 else {
1807                                         if (thread->priority < thread->sched_pri) {
1808                                                 KERNEL_DEBUG_CONSTANT(
1809                                                         MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
1810                                                         thread->sched_pri, thread->priority, 0, mutex, 0);
1811
1812                                                 compute_priority(thread, FALSE);
1813                                         }
1814                                 }
1815                         }
1816                         thread_unlock(thread);
1817                         splx(s);
1818                 }
1819         }
1820         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1821 }
1822
1823
1824 /*
1825  * Routine:     lck_mtx_lock_acquire_x86
1826  *
1827  * Invoked on acquiring the mutex when there is
1828  * contention.
1829  * mutex is owned...  interlock is not held
1830  */
1831 void
1832 lck_mtx_lock_acquire_x86(
1833         lck_mtx_t       *mutex)
1834 {
1835         thread_t        thread = current_thread();
1836         integer_t       priority;
1837
1838         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1839
1840         priority = lck_mtx_lock_get_pri(mutex);
1841
1842         if (thread->sched_pri < priority) {
1843
1844                 if (lck_mtx_lock_mark_promoted(mutex)) {
1845                         spl_t   s = splsched();
1846
1847                         thread_lock(thread);
1848
1849                         if (thread->sched_pri < priority) {
1850
1851                                 KERNEL_DEBUG_CONSTANT(
1852                                         MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
1853                                         thread->sched_pri, priority, 0, mutex, 0);
1854
1855                                 set_sched_pri(thread, priority);
1856                         }
1857                         thread->promotions++;
1858                         thread->sched_mode |= TH_MODE_PROMOTED;
1859
1860                         thread_unlock(thread);
1861                         splx(s);
1862                 }
1863         }
1864         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, (int)mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1865 }
1866
1867
1868
1869 /*
1870  * Routine:     lck_mtx_lock_spinwait_x86
1871  *
1872  * Invoked trying to acquire a mutex when there is contention but
1873  * the holder is running on another processor. We spin for up to a maximum
1874  * time waiting for the lock to be released.
1875  *
1876  * Called with the interlock unlocked.
1877  */
1878 int
1879 lck_mtx_lock_spinwait_x86(
1880         lck_mtx_t       *mutex)
1881 {
1882         thread_t        holder;
1883         uint64_t        deadline;
1884         int             retval = 1;
1885         int             loopcount = 0;
1886
1887         KERNEL_DEBUG(
1888                 MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
1889                 (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
1890
1891         deadline = mach_absolute_time() + MutexSpin;
1892
1893         /*
1894          * Spin while:
1895          *   - mutex is locked, and
1896          *   - its locked as a spin lock, and
1897          *   - owner is running on another processor, and
1898          *   - owner (processor) is not idling, and
1899          *   - we haven't spun for long enough.
1900          */
1901         do {
1902                 if (lck_mtx_lock_grab_mutex(mutex)) {
1903                         retval = 0;
1904                         break;
1905                 }
1906                 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
1907
1908                         if ( !(holder->machine.specFlags & OnProc) ||
1909                              (holder->state & TH_IDLE)) {
1910                                 if (loopcount == 0)
1911                                         retval = 2;
1912                                 break;
1913                         }
1914                 }
1915                 cpu_pause();
1916
1917                 loopcount++;
1918
1919         } while (mach_absolute_time() < deadline);
1920
1921
1922 #if     CONFIG_DTRACE
1923         /*
1924          * We've already kept a count via deadline of how long we spun.
1925          * If dtrace is active, then we compute backwards to decide how
1926          * long we spun.
1927          *
1928          * Note that we record a different probe id depending on whether
1929          * this is a direct or indirect mutex.  This allows us to
1930          * penalize only lock groups that have debug/stats enabled
1931          * with dtrace processing if desired.
1932          */
1933         if (mutex->lck_mtx_ptr != (void *)LCK_MTX_PTR_EXTENDED) {
1934                 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
1935                     mach_absolute_time() - (deadline - MutexSpin));
1936         } else {
1937                 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
1938                     mach_absolute_time() - (deadline - MutexSpin));
1939         }
1940         /* The lockstat acquire event is recorded by the assembly code beneath us. */
1941 #endif
1942
1943         KERNEL_DEBUG(
1944                 MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
1945                 (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
1946
1947         return retval;
1948 }
1949
1950
1951
1952 /*
1953  * Routine:     lck_mtx_lock_wait_x86
1954  *
1955  * Invoked in order to wait on contention.
1956  *
1957  * Called with the interlock locked and
1958  * returns it unlocked.
1959  */
1960 void
1961 lck_mtx_lock_wait_x86 (
1962         lck_mtx_t       *mutex)
1963 {
1964         thread_t        self = current_thread();
1965         thread_t        holder;
1966         integer_t       priority;
1967         integer_t       old_lck_mtx_pri;
1968         spl_t           s;
1969 #if     CONFIG_DTRACE
1970         uint64_t        sleep_start = 0;
1971
1972         if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
1973                 sleep_start = mach_absolute_time();
1974         }
1975 #endif
1976         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
1977
1978         priority = self->sched_pri;
1979
1980         if (priority < self->priority)
1981                 priority = self->priority;
1982         if (priority < BASEPRI_DEFAULT)
1983                 priority = BASEPRI_DEFAULT;
1984
1985         if (mutex->lck_mtx_waiters == 0)
1986                 old_lck_mtx_pri = 0;
1987         else
1988                 old_lck_mtx_pri = mutex->lck_mtx_pri;
1989
1990         if (old_lck_mtx_pri < priority)
1991                 mutex->lck_mtx_pri = priority;
1992
1993         if ( (holder = (thread_t)mutex->lck_mtx_owner) ) {
1994
1995                 s = splsched();
1996                 thread_lock(holder);
1997
1998                 if (holder->sched_pri < priority) {
1999                         KERNEL_DEBUG_CONSTANT(
2000                                 MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
2001                                 holder->sched_pri, priority, holder, mutex, 0);
2002
2003                         set_sched_pri(holder, priority);
2004
2005                         if (mutex->lck_mtx_promoted == 0) {
2006                                 holder->promotions++;
2007                                 holder->sched_mode |= TH_MODE_PROMOTED;
2008
2009                                 mutex->lck_mtx_promoted = 1;
2010                         }
2011                 }
2012                 thread_unlock(holder);
2013                 splx(s);
2014         }
2015         mutex->lck_mtx_waiters++;
2016
2017         assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
2018
2019         lck_mtx_ilk_unlock(mutex);
2020
2021         thread_block(THREAD_CONTINUE_NULL);
2022
2023         KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, (int)mutex, (int)mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
2024
2025 #if     CONFIG_DTRACE
2026         /*
2027          * Record the Dtrace lockstat probe for blocking, block time
2028          * measured from when we were entered.
2029          */
2030         if (sleep_start) {
2031                 if (mutex->lck_mtx_ptr != (void *)LCK_MTX_PTR_EXTENDED) {
2032                         LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2033                             mach_absolute_time() - sleep_start);
2034                 } else {
2035                         LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2036                             mach_absolute_time() - sleep_start);
2037                 }
2038         }
2039 #endif
2040 }
2041
2042
2043 #if     MACH_KDB
2044
2045 void
2046 db_show_one_lock(
2047         lock_t  *lock)
2048 {
2049         db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ",
2050                   lock->lck_rw_shared_count,
2051                   lock->lck_rw_want_upgrade ? "" : "!",
2052                   lock->lck_rw_want_write ? "" : "!");
2053         db_printf("%swaiting, %scan_sleep\n",
2054                   (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!",
2055                   lock->lck_rw_can_sleep ? "" : "!");
2056         db_printf("Interlock:\n");
2057         db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)),
2058                         TRUE, (db_expr_t)0, (char *)0);
2059 }
2060
2061 /*
2062  * Routines to print out simple_locks and mutexes in a nicely-formatted
2063  * fashion.
2064  */
2065
/*
 * Column header printed by db_show_one_simple_lock() above the row
 * produced by db_print_simple_lock(): lock address, interlock value and,
 * when USLOCK_DEBUG is set, the locking thread, a duration and the
 * caller's pc.
 */
const char *simple_lock_labels =        "ENTRY    ILK THREAD   DURATION CALLER";
2067
2068 void
2069 db_show_one_simple_lock (
2070         db_expr_t       addr,
2071         boolean_t       have_addr,
2072         __unused db_expr_t      count,
2073         __unused char           * modif)
2074 {
2075         simple_lock_t   saddr = (simple_lock_t) ((vm_offset_t) addr);
2076
2077         if (saddr == (simple_lock_t)0 || !have_addr) {
2078                 db_error ("No simple_lock\n");
2079         }
2080 #if     USLOCK_DEBUG
2081         else if (saddr->lock_type != USLOCK_TAG)
2082                 db_error ("Not a simple_lock\n");
2083 #endif  /* USLOCK_DEBUG */
2084
2085         db_printf ("%s\n", simple_lock_labels);
2086         db_print_simple_lock (saddr);
2087 }
2088
2089 void
2090 db_print_simple_lock (
2091         simple_lock_t   addr)
2092 {
2093
2094         db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock));
2095 #if     USLOCK_DEBUG
2096         db_printf (" %08x", addr->debug.lock_thread);
2097         db_printf (" %08x ", addr->debug.duration[1]);
2098         db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY);
2099 #endif  /* USLOCK_DEBUG */
2100         db_printf ("\n");
2101 }
2102
2103 #endif  /* MACH_KDB */