/* Source: osfmk/kern/locks.c (apple/xnu), blob 25641b8beb409ef593dec5205996e71db2ae7f05 */
1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 #define ATOMIC_PRIVATE 1
58 #define LOCK_PRIVATE 1
59
60 #include <mach_ldebug.h>
61 #include <debug.h>
62
63 #include <mach/kern_return.h>
64 #include <mach/mach_host_server.h>
65 #include <mach_debug/lockgroup_info.h>
66
67 #include <kern/locks.h>
68 #include <kern/misc_protos.h>
69 #include <kern/kalloc.h>
70 #include <kern/thread.h>
71 #include <kern/processor.h>
72 #include <kern/sched_prim.h>
73 #include <kern/debug.h>
74 #include <machine/atomic.h>
75 #include <machine/machine_cpu.h>
76 #include <string.h>
77
78
79 #include <sys/kdebug.h>
80
81 #if CONFIG_DTRACE
82 /*
83 * We need only enough declarations from the BSD-side to be able to
84 * test if our probe is active, and to call __dtrace_probe(). Setting
85 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
86 */
87 #define NEED_DTRACE_DEFS
88 #include <../bsd/sys/lockstat.h>
89 #endif
90
91 #define LCK_MTX_SLEEP_CODE 0
92 #define LCK_MTX_SLEEP_DEADLINE_CODE 1
93 #define LCK_MTX_LCK_WAIT_CODE 2
94 #define LCK_MTX_UNLCK_WAKEUP_CODE 3
95
96 #if MACH_LDEBUG
97 #define ALIGN_TEST(p,t) do{if((uintptr_t)p&(sizeof(t)-1)) __builtin_trap();}while(0)
98 #else
99 #define ALIGN_TEST(p,t) do{}while(0)
100 #endif
101
102 /* Silence the volatile to _Atomic cast warning */
103 #define ATOMIC_CAST(t,p) ((_Atomic t*)(uintptr_t)(p))
104
105 /* Enforce program order of loads and stores. */
106 #define ordered_load(target, type) \
107 __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed)
108 #define ordered_store(target, type, value) \
109 __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed)
110
111 #define ordered_load_hw(lock) ordered_load(&(lock)->lock_data, uintptr_t)
112 #define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, uintptr_t, (value))
113
114 #define NOINLINE __attribute__((noinline))
115
116
/* Global registry of all lock groups, protected by lck_grp_lock below. */
static queue_head_t	 lck_grp_queue;
static unsigned int	 lck_grp_cnt;	/* number of groups on lck_grp_queue */

/* Mutex guarding lck_grp_queue/lck_grp_cnt; bootstrapped via lck_mtx_init_ext
 * in lck_mod_init() so it can exist before the group machinery is up. */
decl_lck_mtx_data(static,lck_grp_lock)
static lck_mtx_ext_t lck_grp_lock_ext;

/* Default attributes and the bootstrap "Compatibility APIs" group. */
lck_grp_attr_t	LockDefaultGroupAttr;
lck_grp_t		LockCompatGroup;
lck_attr_t		LockDefaultLckAttr;

#if CONFIG_DTRACE && __SMP__
#if defined (__x86_64__)
uint64_t dtrace_spin_threshold = 500; // 500ns
#elif defined(__arm__) || defined(__arm64__)
/* NOTE(review): derived from LOCK_PANIC_TIMEOUT, not a literal 500ns —
 * the "500ns" note assumes a particular timebase/timeout; confirm. */
uint64_t dtrace_spin_threshold = LOCK_PANIC_TIMEOUT / 1000000; // 500ns
#endif
#endif
134
/*
 *	Routine:	lck_mod_init
 *
 *	One-time initialization of the lock-group subsystem. Must run before
 *	any lck_grp_init()/lck_grp_alloc_init() caller, since those take
 *	lck_grp_lock, which is created here.
 */

void
lck_mod_init(
	void)
{
	/*
	 * Obtain "lcks" options:this currently controls lock statistics
	 */
	if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts)))
		LcksOpts = 0;


#if (DEVELOPMENT || DEBUG) && defined(__x86_64__)
	/* "-disable_mtx_chk" turns off mutex preemption checking on x86_64
	 * development/debug kernels. */
	if (!PE_parse_boot_argn("-disable_mtx_chk", &LckDisablePreemptCheck, sizeof (LckDisablePreemptCheck)))
		LckDisablePreemptCheck = 0;
#endif /* (DEVELOPMENT || DEBUG) && defined(__x86_64__) */

	queue_init(&lck_grp_queue);

	/*
	 * Need to bootstrap the LockCompatGroup instead of calling lck_grp_init() here. This avoids
	 * grabbing the lck_grp_lock before it is initialized.
	 */

	bzero(&LockCompatGroup, sizeof(lck_grp_t));
	(void) strncpy(LockCompatGroup.lck_grp_name, "Compatibility APIs", LCK_GRP_MAX_NAME);

	if (LcksOpts & enaLkStat)
		LockCompatGroup.lck_grp_attr = LCK_GRP_ATTR_STAT;
	else
		LockCompatGroup.lck_grp_attr = LCK_ATTR_NONE;

	LockCompatGroup.lck_grp_refcnt = 1;

	/* Enqueue directly — lck_grp_lock is not usable yet. */
	enqueue_tail(&lck_grp_queue, (queue_entry_t)&LockCompatGroup);
	lck_grp_cnt = 1;

	lck_grp_attr_setdefault(&LockDefaultGroupAttr);
	lck_attr_setdefault(&LockDefaultLckAttr);

	/* Now that the bootstrap group exists, the registry mutex itself can
	 * be initialized as a member of that group. */
	lck_mtx_init_ext(&lck_grp_lock, &lck_grp_lock_ext, &LockCompatGroup, &LockDefaultLckAttr);
}
180
181 /*
182 * Routine: lck_grp_attr_alloc_init
183 */
184
185 lck_grp_attr_t *
186 lck_grp_attr_alloc_init(
187 void)
188 {
189 lck_grp_attr_t *attr;
190
191 if ((attr = (lck_grp_attr_t *)kalloc(sizeof(lck_grp_attr_t))) != 0)
192 lck_grp_attr_setdefault(attr);
193
194 return(attr);
195 }
196
197
198 /*
199 * Routine: lck_grp_attr_setdefault
200 */
201
202 void
203 lck_grp_attr_setdefault(
204 lck_grp_attr_t *attr)
205 {
206 if (LcksOpts & enaLkStat)
207 attr->grp_attr_val = LCK_GRP_ATTR_STAT;
208 else
209 attr->grp_attr_val = 0;
210 }
211
212
/*
 *	Routine:	lck_grp_attr_setstat
 *
 *	Enable statistics gathering for groups created with this attribute.
 *	Uses an atomic OR so concurrent attribute updates are safe.
 */

void
lck_grp_attr_setstat(
	lck_grp_attr_t	*attr)
{
	(void)hw_atomic_or(&attr->grp_attr_val, LCK_GRP_ATTR_STAT);
}
223
224
/*
 *	Routine:	lck_grp_attr_free
 *
 *	Release an attribute previously obtained from lck_grp_attr_alloc_init().
 */

void
lck_grp_attr_free(
	lck_grp_attr_t	*attr)
{
	kfree(attr, sizeof(lck_grp_attr_t));
}
235
236
237 /*
238 * Routine: lck_grp_alloc_init
239 */
240
241 lck_grp_t *
242 lck_grp_alloc_init(
243 const char* grp_name,
244 lck_grp_attr_t *attr)
245 {
246 lck_grp_t *grp;
247
248 if ((grp = (lck_grp_t *)kalloc(sizeof(lck_grp_t))) != 0)
249 lck_grp_init(grp, grp_name, attr);
250
251 return(grp);
252 }
253
254 /*
255 * Routine: lck_grp_init
256 */
257
258 void
259 lck_grp_init(lck_grp_t * grp, const char * grp_name, lck_grp_attr_t * attr)
260 {
261 /* make sure locking infrastructure has been initialized */
262 assert(lck_grp_cnt > 0);
263
264 bzero((void *)grp, sizeof(lck_grp_t));
265
266 (void)strlcpy(grp->lck_grp_name, grp_name, LCK_GRP_MAX_NAME);
267
268 if (attr != LCK_GRP_ATTR_NULL)
269 grp->lck_grp_attr = attr->grp_attr_val;
270 else if (LcksOpts & enaLkStat)
271 grp->lck_grp_attr = LCK_GRP_ATTR_STAT;
272 else
273 grp->lck_grp_attr = LCK_ATTR_NONE;
274
275 grp->lck_grp_refcnt = 1;
276
277 lck_mtx_lock(&lck_grp_lock);
278 enqueue_tail(&lck_grp_queue, (queue_entry_t)grp);
279 lck_grp_cnt++;
280 lck_mtx_unlock(&lck_grp_lock);
281 }
282
/*
 *	Routine:	lck_grp_free
 *
 *	Unlink a group from the global registry and drop the registry's
 *	reference; the group memory is freed once the last reference
 *	(via lck_grp_deallocate) goes away.
 */

void
lck_grp_free(
	lck_grp_t	*grp)
{
	lck_mtx_lock(&lck_grp_lock);
	lck_grp_cnt--;
	(void)remque((queue_entry_t)grp);
	lck_mtx_unlock(&lck_grp_lock);
	lck_grp_deallocate(grp);
}
297
298
/*
 *	Routine:	lck_grp_reference
 *
 *	Take an additional reference on a lock group (atomic increment).
 *	Pair with lck_grp_deallocate().
 */

void
lck_grp_reference(
	lck_grp_t	*grp)
{
	(void)hw_atomic_add(&grp->lck_grp_refcnt, 1);
}
309
310
/*
 *	Routine:	lck_grp_deallocate
 *
 *	Drop one reference on a group; frees it when the count hits zero.
 *	NOTE(review): assumes the group was heap-allocated — calling this
 *	on a statically allocated group (e.g. LockCompatGroup) would kfree
 *	static memory; callers must guarantee heap origin.
 */

void
lck_grp_deallocate(
	lck_grp_t	*grp)
{
	if (hw_atomic_sub(&grp->lck_grp_refcnt, 1) == 0)
		kfree(grp, sizeof(lck_grp_t));
}
322
323 /*
324 * Routine: lck_grp_lckcnt_incr
325 */
326
327 void
328 lck_grp_lckcnt_incr(
329 lck_grp_t *grp,
330 lck_type_t lck_type)
331 {
332 unsigned int *lckcnt;
333
334 switch (lck_type) {
335 case LCK_TYPE_SPIN:
336 lckcnt = &grp->lck_grp_spincnt;
337 break;
338 case LCK_TYPE_MTX:
339 lckcnt = &grp->lck_grp_mtxcnt;
340 break;
341 case LCK_TYPE_RW:
342 lckcnt = &grp->lck_grp_rwcnt;
343 break;
344 default:
345 return panic("lck_grp_lckcnt_incr(): invalid lock type: %d\n", lck_type);
346 }
347
348 (void)hw_atomic_add(lckcnt, 1);
349 }
350
351 /*
352 * Routine: lck_grp_lckcnt_decr
353 */
354
355 void
356 lck_grp_lckcnt_decr(
357 lck_grp_t *grp,
358 lck_type_t lck_type)
359 {
360 unsigned int *lckcnt;
361 int updated;
362
363 switch (lck_type) {
364 case LCK_TYPE_SPIN:
365 lckcnt = &grp->lck_grp_spincnt;
366 break;
367 case LCK_TYPE_MTX:
368 lckcnt = &grp->lck_grp_mtxcnt;
369 break;
370 case LCK_TYPE_RW:
371 lckcnt = &grp->lck_grp_rwcnt;
372 break;
373 default:
374 panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type);
375 return;
376 }
377
378 updated = (int)hw_atomic_sub(lckcnt, 1);
379 assert(updated >= 0);
380 }
381
382 /*
383 * Routine: lck_attr_alloc_init
384 */
385
386 lck_attr_t *
387 lck_attr_alloc_init(
388 void)
389 {
390 lck_attr_t *attr;
391
392 if ((attr = (lck_attr_t *)kalloc(sizeof(lck_attr_t))) != 0)
393 lck_attr_setdefault(attr);
394
395 return(attr);
396 }
397
398
/*
 *	Routine:	lck_attr_setdefault
 *
 *	Set a lock attribute to the per-architecture default:
 *	  arm/arm64: always LCK_ATTR_NONE (debug attr panics at boot, see rdar below);
 *	  x86 DEBUG builds: always LCK_ATTR_DEBUG;
 *	  x86 release builds: LCK_ATTR_DEBUG only if the "lcks" boot-arg set enaLkDeb.
 */

void
lck_attr_setdefault(
	lck_attr_t	*attr)
{
#if __arm__ || __arm64__
	/* <rdar://problem/4404579>: Using LCK_ATTR_DEBUG here causes panic at boot time for arm */
	attr->lck_attr_val =  LCK_ATTR_NONE;
#elif __i386__ || __x86_64__
#if     !DEBUG
 	if (LcksOpts & enaLkDeb)
 		attr->lck_attr_val =  LCK_ATTR_DEBUG;
 	else
 		attr->lck_attr_val =  LCK_ATTR_NONE;
#else
 	attr->lck_attr_val =  LCK_ATTR_DEBUG;
#endif	/* !DEBUG */
#else
#error Unknown architecture.
#endif	/* __arm__ */
}
423
424
/*
 *	Routine:	lck_attr_setdebug
 *
 *	Atomically set the debug flag in a lock attribute.
 */
void
lck_attr_setdebug(
	lck_attr_t	*attr)
{
	(void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_DEBUG);
}
434
/*
 *	Routine:	lck_attr_cleardebug
 *	(header previously said "setdebug" — copy-paste error)
 *
 *	Atomically clear the debug flag in a lock attribute.
 */
void
lck_attr_cleardebug(
	lck_attr_t	*attr)
{
	(void)hw_atomic_and(&attr->lck_attr_val, ~LCK_ATTR_DEBUG);
}
444
445
/*
 *	Routine:	lck_attr_rw_shared_priority
 *
 *	Atomically mark rw locks created with this attribute as
 *	shared-priority (readers not starved by pending writers).
 */
void
lck_attr_rw_shared_priority(
	lck_attr_t	*attr)
{
	(void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY);
}
455
456
/*
 *	Routine:	lck_attr_free
 *
 *	Release an attribute previously obtained from lck_attr_alloc_init().
 */
void
lck_attr_free(
	lck_attr_t	*attr)
{
	kfree(attr, sizeof(lck_attr_t));
}
466
/*
 * Routine:	hw_lock_init
 *
 *	Initialize a hardware lock.  Stores 0 (unlocked) with relaxed
 *	ordering; publication to other CPUs is the caller's concern.
 */
void
hw_lock_init(hw_lock_t lock)
{
	ordered_store_hw(lock, 0);
}
477
/*
 *	Routine: hw_lock_lock_contended
 *
 *	Spin until lock is acquired or timeout expires.
 *	timeout is in mach_absolute_time ticks. Called with
 *	preemption disabled.
 *
 *	Returns 1 on acquisition, 0 on timeout (only reachable when
 *	do_panic is FALSE; otherwise a timeout panics).
 */

#if	__SMP__
static unsigned int NOINLINE
hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic)
{
	uint64_t	end = 0;
	/* Snapshot of the current owner for the panic message; may be stale
	 * by the time we actually panic. */
	uintptr_t	holder = lock->lock_data;
	int		i;

	if (timeout == 0)
		timeout = LOCK_PANIC_TIMEOUT;
#if CONFIG_DTRACE
	uint64_t begin;
	boolean_t dtrace_enabled = lockstat_probemap[LS_LCK_SPIN_LOCK_SPIN] != 0;
	if (__improbable(dtrace_enabled))
		begin = mach_absolute_time();
#endif
	for ( ; ; ) {
		/* Inner snoop loop: spin LOCK_SNOOP_SPINS times before paying
		 * for a timebase read in the timeout check below. */
		for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
			cpu_pause();
#if (!__ARM_ENABLE_WFE_) || (LOCK_PRETEST)
			/* Read-only pretest: avoid the CAS while held. */
			holder = ordered_load_hw(lock);
			if (holder != 0)
				continue;
#endif
			if (atomic_compare_exchange(&lock->lock_data, 0, data,
			    memory_order_acquire_smp, TRUE)) {
#if CONFIG_DTRACE
				if (__improbable(dtrace_enabled)) {
					uint64_t spintime = mach_absolute_time() - begin;
					if (spintime > dtrace_spin_threshold)
						LOCKSTAT_RECORD2(LS_LCK_SPIN_LOCK_SPIN, lock, spintime, dtrace_spin_threshold);
				}
#endif
				return 1;
			}
		}
		/* Lazily arm the deadline on the first pass so an uncontended
		 * fast acquisition never reads the timebase. */
		if (end == 0) {
			end = ml_get_timebase() + timeout;
		}
		else if (ml_get_timebase() >= end)
			break;
	}
	if (do_panic) {
		// Capture the actual time spent blocked, which may be higher than the timeout
		// if a misbehaving interrupt stole this thread's CPU time.
		panic("Spinlock timeout after %llu ticks, %p = %lx",
			(ml_get_timebase() - end + timeout), lock, holder);
	}
	return 0;
}
#endif	// __SMP__
537
/*
 *	Routine: hw_lock_lock
 *
 *	Acquire lock, spinning until it becomes available,
 *	return with preemption disabled.
 *
 *	The stored value encodes the owning thread plus PLATFORM_LCK_ILOCK.
 *	On SMP, contention falls through to hw_lock_lock_contended() with
 *	do_panic=TRUE (spin forever up to LOCK_PANIC_TIMEOUT, then panic).
 *	On UP, finding the lock held is an immediate panic: with preemption
 *	disabled no one could ever release it.
 */
void
hw_lock_lock(hw_lock_t lock)
{
	thread_t	thread;
	uintptr_t	state;

	thread = current_thread();
	disable_preemption_for_thread(thread);
	state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
#if	__SMP__

#if	LOCK_PRETEST
	if (ordered_load_hw(lock))
		goto contended;
#endif	// LOCK_PRETEST
	if (atomic_compare_exchange(&lock->lock_data, 0, state,
					memory_order_acquire_smp, TRUE)) {
		goto end;
	}
#if	LOCK_PRETEST
contended:
#endif	// LOCK_PRETEST
	hw_lock_lock_contended(lock, state, 0, TRUE);
end:
#else	// __SMP__
	if (lock->lock_data)
		panic("Spinlock held %p", lock);
	lock->lock_data = state;
#endif	// __SMP__
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
	return;
}
578
/*
 *	Routine: hw_lock_to
 *
 *	Acquire lock, spinning until it becomes available or timeout.
 *	Timeout is in mach_absolute_time ticks, return with
 *	preemption disabled.
 *
 *	Returns 1 if the lock was acquired, 0 on timeout. Unlike
 *	hw_lock_lock(), a timeout here does not panic (do_panic=FALSE).
 *	NB: preemption stays disabled even on failure — callers re-enable.
 */
unsigned int
hw_lock_to(hw_lock_t lock, uint64_t timeout)
{
	thread_t	thread;
	uintptr_t	state;
	unsigned int success = 0;

	thread = current_thread();
	disable_preemption_for_thread(thread);
	state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
#if	__SMP__

#if	LOCK_PRETEST
	if (ordered_load_hw(lock))
		goto contended;
#endif	// LOCK_PRETEST
	if (atomic_compare_exchange(&lock->lock_data, 0, state,
					memory_order_acquire_smp, TRUE)) {
		success = 1;
		goto end;
	}
#if	LOCK_PRETEST
contended:
#endif	// LOCK_PRETEST
	success = hw_lock_lock_contended(lock, state, timeout, FALSE);
end:
#else	// __SMP__
	/* UP: a held lock can never be released while we spin with
	 * preemption off, so just try once. */
	(void)timeout;
	if (ordered_load_hw(lock) == 0) {
		ordered_store_hw(lock, state);
		success = 1;
	}
#endif	// __SMP__
#if CONFIG_DTRACE
	if (success)
		LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
	return success;
}
625
/*
 *	Routine: hw_lock_try
 *
 *	Single non-blocking acquisition attempt.
 *	returns with preemption disabled on success; on failure preemption
 *	is restored to its entry state.
 *
 *	With LOCK_TRY_DISABLE_INT the attempt runs with interrupts masked
 *	instead, and preemption is only disabled after a successful CAS.
 */
unsigned int
hw_lock_try(hw_lock_t lock)
{
	thread_t	thread = current_thread();
	int		success = 0;
#if	LOCK_TRY_DISABLE_INT
	long		intmask;

	intmask = disable_interrupts();
#else
	disable_preemption_for_thread(thread);
#endif	// LOCK_TRY_DISABLE_INT

#if	__SMP__
#if	LOCK_PRETEST
	if (ordered_load_hw(lock))
		goto failed;
#endif	// LOCK_PRETEST
	success = atomic_compare_exchange(&lock->lock_data, 0, LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK,
					memory_order_acquire_smp, FALSE);
#else
	if (lock->lock_data == 0) {
		lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
		success = 1;
	}
#endif	// __SMP__

#if	LOCK_TRY_DISABLE_INT
	if (success)
		disable_preemption_for_thread(thread);
#if	LOCK_PRETEST
failed:
#endif	// LOCK_PRETEST
	restore_interrupts(intmask);
#else
#if	LOCK_PRETEST
failed:
#endif	// LOCK_PRETEST
	if (!success)
		enable_preemption();
#endif	// LOCK_TRY_DISABLE_INT
#if CONFIG_DTRACE
	if (success)
		LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
	return success;
}
678
/*
 *	Routine: hw_lock_unlock
 *
 *	Unconditionally release lock, release preemption level.
 *	The release-ordered store publishes all writes made under the lock.
 */
void
hw_lock_unlock(hw_lock_t lock)
{
	__c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp);
#if __arm__ || __arm64__
	// ARM tests are only for open-source exclusion
	/* Wake any WFE-parked spinners. */
	set_event();
#endif	// __arm__ || __arm64__
#if	CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, 0);
#endif /* CONFIG_DTRACE */
	enable_preemption();
}
697
/*
 * Routine hw_lock_held, doesn't change preemption state.
 * N.B. Racy, of course. Returns nonzero iff the lock word was nonzero
 * at the moment of the (relaxed) load — suitable for asserts only.
 */
unsigned int
hw_lock_held(hw_lock_t lock)
{
	return (ordered_load_hw(lock) != 0);
}
707
708 /*
709 * Routine: lck_spin_sleep
710 */
711 wait_result_t
712 lck_spin_sleep(
713 lck_spin_t *lck,
714 lck_sleep_action_t lck_sleep_action,
715 event_t event,
716 wait_interrupt_t interruptible)
717 {
718 wait_result_t res;
719
720 if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
721 panic("Invalid lock sleep action %x\n", lck_sleep_action);
722
723 res = assert_wait(event, interruptible);
724 if (res == THREAD_WAITING) {
725 lck_spin_unlock(lck);
726 res = thread_block(THREAD_CONTINUE_NULL);
727 if (!(lck_sleep_action & LCK_SLEEP_UNLOCK))
728 lck_spin_lock(lck);
729 }
730 else
731 if (lck_sleep_action & LCK_SLEEP_UNLOCK)
732 lck_spin_unlock(lck);
733
734 return res;
735 }
736
737
738 /*
739 * Routine: lck_spin_sleep_deadline
740 */
741 wait_result_t
742 lck_spin_sleep_deadline(
743 lck_spin_t *lck,
744 lck_sleep_action_t lck_sleep_action,
745 event_t event,
746 wait_interrupt_t interruptible,
747 uint64_t deadline)
748 {
749 wait_result_t res;
750
751 if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
752 panic("Invalid lock sleep action %x\n", lck_sleep_action);
753
754 res = assert_wait_deadline(event, interruptible, deadline);
755 if (res == THREAD_WAITING) {
756 lck_spin_unlock(lck);
757 res = thread_block(THREAD_CONTINUE_NULL);
758 if (!(lck_sleep_action & LCK_SLEEP_UNLOCK))
759 lck_spin_lock(lck);
760 }
761 else
762 if (lck_sleep_action & LCK_SLEEP_UNLOCK)
763 lck_spin_unlock(lck);
764
765 return res;
766 }
767
768
/*
 * Routine: 	lck_mtx_clear_promoted
 *
 * Handle clearing of TH_SFLAG_PROMOTED,
 * adjusting thread priority as needed.
 *
 * Called with thread lock held
 */
static void
lck_mtx_clear_promoted (
	thread_t 			thread,
	__kdebug_only uintptr_t		trace_lck)
{
	thread->sched_flags &= ~TH_SFLAG_PROMOTED;

	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
		/* Thread still has a RW lock promotion */
	} else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
		/* Depressed thread: fall back to DEPRESSPRI now that the
		 * mutex promotion no longer overrides the depression. */
		KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
				  thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
		set_sched_pri(thread, DEPRESSPRI);
	} else {
		/* No other promotion/depression in play: recompute from base.
		 * The tracepoint fires only when this is an actual demotion. */
		if (thread->base_pri < thread->sched_pri) {
			KERNEL_DEBUG_CONSTANT(
				MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
					  thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
		}
		thread_recompute_sched_pri(thread, FALSE);
	}
}
800
801
/*
 * Routine:	lck_mtx_sleep
 *
 *	Atomically drop a held mutex and wait on event. On wakeup the mutex
 *	is re-taken (spin / spin-always variants selectable via flags) unless
 *	LCK_SLEEP_UNLOCK was passed. LCK_SLEEP_PROMOTED_PRI borrows the RW
 *	promotion machinery to keep the sleeper's priority boosted.
 */
wait_result_t
lck_mtx_sleep(
        lck_mtx_t		*lck,
	lck_sleep_action_t	lck_sleep_action,
	event_t			event,
	wait_interrupt_t	interruptible)
{
	wait_result_t	res;
	thread_t	thread = current_thread();
 
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_START,
		     VM_KERNEL_UNSLIDE_OR_PERM(lck), (int)lck_sleep_action, VM_KERNEL_UNSLIDE_OR_PERM(event), (int)interruptible, 0);

	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
		panic("Invalid lock sleep action %x\n", lck_sleep_action);

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		/*
		 * We overload the RW lock promotion to give us a priority ceiling
		 * during the time that this thread is asleep, so that when it
		 * is re-awakened (and not yet contending on the mutex), it is
		 * runnable at a reasonably high priority.
		 */
		thread->rwlock_count++;
	}

	res = assert_wait(event, interruptible);
	if (res == THREAD_WAITING) {
		lck_mtx_unlock(lck);
		res = thread_block(THREAD_CONTINUE_NULL);
		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
			if ((lck_sleep_action & LCK_SLEEP_SPIN))
				lck_mtx_lock_spin(lck);
			else if ((lck_sleep_action & LCK_SLEEP_SPIN_ALWAYS))
				lck_mtx_lock_spin_always(lck);
			else
				lck_mtx_lock(lck);
		}
	}
	else
	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
		lck_mtx_unlock(lck);

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		/* Drop the borrowed RW boost taken above. */
		if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
			/* sched_flags checked without lock, but will be rechecked while clearing */
			lck_rw_clear_promotion(thread);
		}
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0);

	return res;
}
859
860
/*
 * Routine:	lck_mtx_sleep_deadline
 *
 *	Deadline variant of lck_mtx_sleep(); see that routine for the flag
 *	semantics.
 *	NOTE(review): unlike lck_mtx_sleep(), the relock path here does not
 *	honor LCK_SLEEP_SPIN_ALWAYS (it falls through to lck_mtx_lock) —
 *	confirm whether this asymmetry is intentional.
 */
wait_result_t
lck_mtx_sleep_deadline(
        lck_mtx_t		*lck,
	lck_sleep_action_t	lck_sleep_action,
	event_t			event,
	wait_interrupt_t	interruptible,
	uint64_t		deadline)
{
	wait_result_t   res;
	thread_t	thread = current_thread();

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_START,
		     VM_KERNEL_UNSLIDE_OR_PERM(lck), (int)lck_sleep_action, VM_KERNEL_UNSLIDE_OR_PERM(event), (int)interruptible, 0);

	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
		panic("Invalid lock sleep action %x\n", lck_sleep_action);

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		/*
		 * See lck_mtx_sleep().
		 */
		thread->rwlock_count++;
	}

	res = assert_wait_deadline(event, interruptible, deadline);
	if (res == THREAD_WAITING) {
		lck_mtx_unlock(lck);
		res = thread_block(THREAD_CONTINUE_NULL);
		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
			if ((lck_sleep_action & LCK_SLEEP_SPIN))
				lck_mtx_lock_spin(lck);
			else
				lck_mtx_lock(lck);
		}
	}
	else
	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
		lck_mtx_unlock(lck);

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		/* Drop the borrowed RW boost taken above. */
		if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
			/* sched_flags checked without lock, but will be rechecked while clearing */
			lck_rw_clear_promotion(thread);
		}
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0);

	return res;
}
914
/*
 * Routine: 	lck_mtx_lock_wait
 *
 * Invoked in order to wait on contention.
 *
 * Called with the interlock locked and
 * returns it unlocked.
 *
 * Implements mutex priority inheritance: the waiter pushes the holder
 * up to min(waiter's effective pri, MAXPRI_PROMOTE), records itself as
 * a waiter on the mutex, then blocks uninterruptibly.
 */
void
lck_mtx_lock_wait (
	lck_mtx_t			*lck,
	thread_t			holder)
{
	thread_t		self = current_thread();
	lck_mtx_t		*mutex;
	__kdebug_only uintptr_t	trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	__kdebug_only uintptr_t	trace_holder = VM_KERNEL_UNSLIDE_OR_PERM(holder);
	integer_t		priority;
	spl_t			s = splsched();
#if	CONFIG_DTRACE
	uint64_t		sleep_start = 0;

	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
		sleep_start = mach_absolute_time();
	}
#endif

	/* Resolve an indirect (extended) mutex to its real lock body. */
	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
		mutex = lck;
	else
		mutex = &lck->lck_mtx_ptr->lck_mtx;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, trace_lck, trace_holder, 0, 0, 0);

	/* Effective priority to push: at least our base pri, and at least
	 * BASEPRI_DEFAULT so low-priority waiters still make progress. */
	priority = self->sched_pri;
	if (priority < self->base_pri)
		priority = self->base_pri;
	if (priority < BASEPRI_DEFAULT)
		priority = BASEPRI_DEFAULT;

	/* Do not promote past promotion ceiling */
	priority = MIN(priority, MAXPRI_PROMOTE);

	thread_lock(holder);
	if (mutex->lck_mtx_pri == 0) {
		/* First waiter on this mutex: charge the holder one promotion. */
		holder->promotions++;
		holder->sched_flags |= TH_SFLAG_PROMOTED;
	}

	if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) {
		KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
					holder->sched_pri, priority, trace_holder, trace_lck, 0);
		set_sched_pri(holder, priority);
	}
	thread_unlock(holder);
	splx(s);

	/* lck_mtx_pri and the waiter count are protected by the interlock,
	 * which the caller still holds here. */
	if (mutex->lck_mtx_pri < priority)
		mutex->lck_mtx_pri = priority;
	if (self->pending_promoter[self->pending_promoter_index] == NULL) {
		self->pending_promoter[self->pending_promoter_index] = mutex;
		mutex->lck_mtx_waiters++;
	}
	else
	if (self->pending_promoter[self->pending_promoter_index] != mutex) {
		self->pending_promoter[++self->pending_promoter_index] = mutex;
		mutex->lck_mtx_waiters++;
	}

	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
	assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
	lck_mtx_ilk_unlock(mutex);

	thread_block(THREAD_CONTINUE_NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
#if	CONFIG_DTRACE
	/*
	 * Record the DTrace lockstat probe for blocking, block time
	 * measured from when we were entered.
	 */
	if (sleep_start) {
		if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, lck,
			    mach_absolute_time() - sleep_start);
		} else {
			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, lck,
			    mach_absolute_time() - sleep_start);
		}
	}
#endif
}
1008
/*
 * Routine: 	lck_mtx_lock_acquire
 *
 * Invoked on acquiring the mutex when there is
 * contention.
 *
 * Returns the current number of waiters.
 *
 * Called with the interlock locked.
 *
 * Completes the priority-inheritance hand-off: removes this thread's
 * pending-promoter entry, and if waiters remain, promotes the new
 * holder to the mutex's recorded ceiling priority.
 */
int
lck_mtx_lock_acquire(
	lck_mtx_t		*lck)
{
	thread_t		thread = current_thread();
	lck_mtx_t		*mutex;
	integer_t		priority;
	spl_t			s;
	__kdebug_only uintptr_t	trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);

	/* Resolve an indirect (extended) mutex to its real lock body. */
	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
		mutex = lck;
	else
		mutex = &lck->lck_mtx_ptr->lck_mtx;

	/* Retire the pending-promoter slot this thread reserved in
	 * lck_mtx_lock_wait(); it is no longer a waiter. */
	if (thread->pending_promoter[thread->pending_promoter_index] == mutex) {
		thread->pending_promoter[thread->pending_promoter_index] = NULL;
		if (thread->pending_promoter_index > 0)
			thread->pending_promoter_index--;
		mutex->lck_mtx_waiters--;
	}

	if (mutex->lck_mtx_waiters)
		priority = mutex->lck_mtx_pri;
	else {
		/* Last waiter gone: the mutex no longer carries a ceiling. */
		mutex->lck_mtx_pri = 0;
		priority = 0;
	}

	if (priority || thread->was_promoted_on_wakeup) {
		s = splsched();
		thread_lock(thread);

		if (priority) {
			/* Remaining waiters: the new holder inherits the ceiling. */
			thread->promotions++;
			thread->sched_flags |= TH_SFLAG_PROMOTED;
			if (thread->sched_pri < priority) {
				KERNEL_DEBUG_CONSTANT(
					MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
						thread->sched_pri, priority, 0, trace_lck, 0);
				/* Do not promote past promotion ceiling */
				assert(priority <= MAXPRI_PROMOTE);
				set_sched_pri(thread, priority);
			}
		}
		if (thread->was_promoted_on_wakeup) {
			/* Undo the wakeup-time boost if no promotion remains. */
			thread->was_promoted_on_wakeup = 0;
			if (thread->promotions == 0)
				lck_mtx_clear_promoted(thread, trace_lck);
		}

		thread_unlock(thread);
		splx(s);
	}

#if CONFIG_DTRACE
	if (lockstat_probemap[LS_LCK_MTX_LOCK_ACQUIRE] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_ACQUIRE]) {
		if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lck, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, lck, 0);
		}
	}
#endif	
	return (mutex->lck_mtx_waiters);
}
1085
/*
 * Routine: 	lck_mtx_unlock_wakeup
 *
 * Invoked on unlock when there is contention.
 *
 * Called with the interlock locked.
 *
 * Wakes one waiter (at the mutex's ceiling priority when more remain)
 * and drops this thread's promotion charge for the mutex.
 */
void
lck_mtx_unlock_wakeup (
	lck_mtx_t			*lck,
	thread_t			holder)
{
	thread_t		thread = current_thread();
	lck_mtx_t		*mutex;
	__kdebug_only uintptr_t	trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);

	/* Resolve an indirect (extended) mutex to its real lock body. */
	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
		mutex = lck;
	else
		mutex = &lck->lck_mtx_ptr->lck_mtx;

	/* Only the holder may unlock. */
	if (thread != holder)
		panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder), 0, 0, 0);

	assert(mutex->lck_mtx_waiters > 0);
	if (mutex->lck_mtx_waiters > 1)
		/* More waiters will remain: hand the ceiling priority along. */
		thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri);
	else
		thread_wakeup_one(LCK_MTX_EVENT(lck));

	if (thread->promotions > 0) {
		spl_t		s = splsched();

		thread_lock(thread);
		if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED))
			lck_mtx_clear_promoted(thread, trace_lck);
		thread_unlock(thread);
		splx(s);
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
1130
/*
 * Routine:	lck_mtx_unlockspin_wakeup
 *
 *	Wake one waiter when a spin-held mutex is released; no promotion
 *	bookkeeping (spin acquisitions never promoted the holder).
 */
void
lck_mtx_unlockspin_wakeup (
	lck_mtx_t			*lck)
{
	assert(lck->lck_mtx_waiters > 0);
	thread_wakeup_one(LCK_MTX_EVENT(lck));

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(lck), 0, 0, 1, 0);
#if CONFIG_DTRACE
	/*
	 * When there are waiters, we skip the hot-patch spot in the
	 * fastpath, so we record it here.
	 */
	LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0);
#endif
}
1147
1148
/*
 * Routine: mutex_pause
 *
 * Called by former callers of simple_lock_pause().
 */
#define	MAX_COLLISION_COUNTS	32
#define	MAX_COLLISION 	8

/* Histogram of collision counts, indexed by (clamped) collision number. */
unsigned int max_collision_count[MAX_COLLISION_COUNTS];

/* Back-off delay in microseconds for each collision level. */
uint32_t collision_backoffs[MAX_COLLISION] = {
        10, 50, 100, 200, 400, 600, 800, 1000
};
1162
1163
1164 void
1165 mutex_pause(uint32_t collisions)
1166 {
1167 wait_result_t wait_result;
1168 uint32_t back_off;
1169
1170 if (collisions >= MAX_COLLISION_COUNTS)
1171 collisions = MAX_COLLISION_COUNTS - 1;
1172 max_collision_count[collisions]++;
1173
1174 if (collisions >= MAX_COLLISION)
1175 collisions = MAX_COLLISION - 1;
1176 back_off = collision_backoffs[collisions];
1177
1178 wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
1179 assert(wait_result == THREAD_WAITING);
1180
1181 wait_result = thread_block(THREAD_CONTINUE_NULL);
1182 assert(wait_result == THREAD_TIMED_OUT);
1183 }
1184
1185
/* Global counters: how often lck_mtx_yield() found waiters (and yielded)
 * vs. found none (and kept the lock). Updated without synchronization. */
unsigned int mutex_yield_wait = 0;
unsigned int mutex_yield_no_wait = 0;
1188
1189 void
1190 lck_mtx_yield(
1191 lck_mtx_t *lck)
1192 {
1193 int waiters;
1194
1195 #if DEBUG
1196 lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
1197 #endif /* DEBUG */
1198
1199 if (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT)
1200 waiters = lck->lck_mtx_ptr->lck_mtx.lck_mtx_waiters;
1201 else
1202 waiters = lck->lck_mtx_waiters;
1203
1204 if ( !waiters) {
1205 mutex_yield_no_wait++;
1206 } else {
1207 mutex_yield_wait++;
1208 lck_mtx_unlock(lck);
1209 mutex_pause(0);
1210 lck_mtx_lock(lck);
1211 }
1212 }
1213
1214
/*
 * Routine:	lck_rw_sleep
 *
 * Release the RW lock, wait for `event', and on wakeup re-acquire
 * the lock as directed by lck_sleep_action (unless LCK_SLEEP_UNLOCK
 * is set, in which case the lock is left released).  Returns the
 * wait result from thread_block().
 */
wait_result_t
lck_rw_sleep(
        lck_rw_t		*lck,
	lck_sleep_action_t	lck_sleep_action,
	event_t			event,
	wait_interrupt_t	interruptible)
{
	wait_result_t	res;
	lck_rw_type_t	lck_rw_type;
	thread_t		thread = current_thread();

	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
		panic("Invalid lock sleep action %x\n", lck_sleep_action);

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		/*
		 * Although we are dropping the RW lock, the intent in most cases
		 * is that this thread remains as an observer, since it may hold
		 * some secondary resource, but must yield to avoid deadlock. In
		 * this situation, make sure that the thread is boosted to the
		 * RW lock ceiling while blocked, so that it can re-acquire the
		 * RW lock at that priority.
		 */
		thread->rwlock_count++;
	}

	/* Queue on the wait event BEFORE dropping the lock, to avoid a lost wakeup. */
	res = assert_wait(event, interruptible);
	if (res == THREAD_WAITING) {
		lck_rw_type = lck_rw_done(lck);	/* remember the mode held on entry */
		res = thread_block(THREAD_CONTINUE_NULL);
		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
			/* Re-take in the requested mode; default is the mode held on entry. */
			if (!(lck_sleep_action & (LCK_SLEEP_SHARED|LCK_SLEEP_EXCLUSIVE)))
				lck_rw_lock(lck, lck_rw_type);
			else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE)
				lck_rw_lock_exclusive(lck);
			else
				lck_rw_lock_shared(lck);
		}
	}
	else
	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
		(void)lck_rw_done(lck);	/* wait was refused: still honor the unlock request */

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
			/* sched_flags checked without lock, but will be rechecked while clearing */

			/* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
			assert(lck_sleep_action & LCK_SLEEP_UNLOCK);

			lck_rw_clear_promotion(thread);
		}
	}

	return res;
}
1274
1275
/*
 * Routine:	lck_rw_sleep_deadline
 *
 * Deadline variant of lck_rw_sleep(): identical semantics, but the
 * wait is bounded by `deadline' (absolute time) via
 * assert_wait_deadline().  Returns the wait result from
 * thread_block(), e.g. THREAD_TIMED_OUT when the deadline fired.
 */
wait_result_t
lck_rw_sleep_deadline(
	lck_rw_t		*lck,
	lck_sleep_action_t	lck_sleep_action,
	event_t			event,
	wait_interrupt_t	interruptible,
	uint64_t		deadline)
{
	wait_result_t   res;
	lck_rw_type_t	lck_rw_type;
	thread_t		thread = current_thread();

	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
		panic("Invalid lock sleep action %x\n", lck_sleep_action);

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		/* See lck_rw_sleep(): keep the RW-boost while blocked. */
		thread->rwlock_count++;
	}

	/* Queue on the wait event BEFORE dropping the lock, to avoid a lost wakeup. */
	res = assert_wait_deadline(event, interruptible, deadline);
	if (res == THREAD_WAITING) {
		lck_rw_type = lck_rw_done(lck);	/* remember the mode held on entry */
		res = thread_block(THREAD_CONTINUE_NULL);
		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
			/* Re-take in the requested mode; default is the mode held on entry. */
			if (!(lck_sleep_action & (LCK_SLEEP_SHARED|LCK_SLEEP_EXCLUSIVE)))
				lck_rw_lock(lck, lck_rw_type);
			else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE)
				lck_rw_lock_exclusive(lck);
			else
				lck_rw_lock_shared(lck);
		}
	}
	else
	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
		(void)lck_rw_done(lck);	/* wait was refused: still honor the unlock request */

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
			/* sched_flags checked without lock, but will be rechecked while clearing */

			/* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
			assert(lck_sleep_action & LCK_SLEEP_UNLOCK);

			lck_rw_clear_promotion(thread);
		}
	}

	return res;
}
1328
1329 /*
1330 * Reader-writer lock promotion
1331 *
1332 * We support a limited form of reader-writer
1333 * lock promotion whose effects are:
1334 *
1335 * * Qualifying threads have decay disabled
 * * Scheduler priority is reset to a floor of
 *   their statically assigned priority
 *   or BASEPRI_BACKGROUND
1339 *
1340 * The rationale is that lck_rw_ts do not have
1341 * a single owner, so we cannot apply a directed
1342 * priority boost from all waiting threads
1343 * to all holding threads without maintaining
1344 * lists of all shared owners and all waiting
1345 * threads for every lock.
1346 *
1347 * Instead (and to preserve the uncontended fast-
1348 * path), acquiring (or attempting to acquire)
 * a RW lock in shared or exclusive mode increments
1350 * a per-thread counter. Only if that thread stops
1351 * making forward progress (for instance blocking
1352 * on a mutex, or being preempted) do we consult
1353 * the counter and apply the priority floor.
1354 * When the thread becomes runnable again (or in
1355 * the case of preemption it never stopped being
1356 * runnable), it has the priority boost and should
1357 * be in a good position to run on the CPU and
1358 * release all RW locks (at which point the priority
1359 * boost is cleared).
1360 *
1361 * Care must be taken to ensure that priority
1362 * boosts are not retained indefinitely, since unlike
1363 * mutex priority boosts (where the boost is tied
1364 * to the mutex lifecycle), the boost is tied
1365 * to the thread and independent of any particular
1366 * lck_rw_t. Assertions are in place on return
1367 * to userspace so that the boost is not held
1368 * indefinitely.
1369 *
1370 * The routines that increment/decrement the
1371 * per-thread counter should err on the side of
1372 * incrementing any time a preemption is possible
1373 * and the lock would be visible to the rest of the
1374 * system as held (so it should be incremented before
1375 * interlocks are dropped/preemption is enabled, or
1376 * before a CAS is executed to acquire the lock).
1377 *
1378 */
1379
/*
 * lck_rw_clear_promotion: Undo priority promotions when the last RW
 * lock is released by a thread (if a promotion was active)
 */
void lck_rw_clear_promotion(thread_t thread)
{
	assert(thread->rwlock_count == 0);	/* caller must have released all RW locks */

	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
	spl_t s = splsched();

	thread_lock(thread);

	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
		thread->sched_flags &= ~TH_SFLAG_RW_PROMOTED;

		if (thread->sched_flags & TH_SFLAG_PROMOTED) {
			/* Thread still has a mutex promotion */
		} else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
			/* Thread is priority-depressed: fall back to DEPRESSPRI. */
			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
					      (uintptr_t)thread_tid(thread), thread->sched_pri, DEPRESSPRI, 0, 0);

			set_sched_pri(thread, DEPRESSPRI);
		} else {
			/* No other boost active: recompute the normal scheduled priority. */
			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
					      (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, 0, 0);

			thread_recompute_sched_pri(thread, FALSE);
		}
	}

	thread_unlock(thread);
	splx(s);
}
1414
1415 /*
1416 * Callout from context switch if the thread goes
1417 * off core with a positive rwlock_count
1418 *
1419 * Called at splsched with the thread locked
1420 */
1421 void
1422 lck_rw_set_promotion_locked(thread_t thread)
1423 {
1424 if (LcksOpts & disLkRWPrio)
1425 return;
1426
1427 integer_t priority;
1428
1429 priority = thread->sched_pri;
1430
1431 if (priority < thread->base_pri)
1432 priority = thread->base_pri;
1433 if (priority < BASEPRI_BACKGROUND)
1434 priority = BASEPRI_BACKGROUND;
1435
1436 if ((thread->sched_pri < priority) ||
1437 !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1438 KERNEL_DEBUG_CONSTANT(
1439 MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE,
1440 (uintptr_t)thread_tid(thread), thread->sched_pri,
1441 thread->base_pri, priority, 0);
1442
1443 thread->sched_flags |= TH_SFLAG_RW_PROMOTED;
1444
1445 if (thread->sched_pri < priority)
1446 set_sched_pri(thread, priority);
1447 }
1448 }
1449
/*
 * Routine:	host_lockgroup_info
 *
 * Export per-lock-group statistics (spin, mutex, and RW counters)
 * for every registered lck_grp_t.  The records are built in a
 * freshly allocated pageable buffer and returned out-of-line as a
 * vm_map_copy_t in *lockgroup_infop, with the record count in
 * *lockgroup_infoCntp.  Ownership of the copy transfers to the
 * caller (via MIG).
 */
kern_return_t
host_lockgroup_info(
	host_t					host,
	lockgroup_info_array_t	*lockgroup_infop,
	mach_msg_type_number_t	*lockgroup_infoCntp)
{
	lockgroup_info_t	*lockgroup_info_base;
	lockgroup_info_t	*lockgroup_info;
	vm_offset_t			lockgroup_info_addr;
	vm_size_t			lockgroup_info_size;
	vm_size_t			lockgroup_info_vmsize;
	lck_grp_t			*lck_grp;
	unsigned int		i;
	vm_map_copy_t		copy;
	kern_return_t		kr;

	if (host == HOST_NULL)
		return KERN_INVALID_HOST;

	/* Hold the group-list lock so lck_grp_cnt and the queue stay stable. */
	lck_mtx_lock(&lck_grp_lock);

	lockgroup_info_size = lck_grp_cnt * sizeof(*lockgroup_info);
	lockgroup_info_vmsize = round_page(lockgroup_info_size);
	kr = kmem_alloc_pageable(ipc_kernel_map,
						&lockgroup_info_addr, lockgroup_info_vmsize, VM_KERN_MEMORY_IPC);
	if (kr != KERN_SUCCESS) {
		lck_mtx_unlock(&lck_grp_lock);
		return(kr);
	}

	lockgroup_info_base = (lockgroup_info_t *) lockgroup_info_addr;
	lck_grp = (lck_grp_t *)queue_first(&lck_grp_queue);
	lockgroup_info = lockgroup_info_base;

	/* One record per registered lock group, in queue order. */
	for (i = 0; i < lck_grp_cnt; i++) {

		lockgroup_info->lock_spin_cnt = lck_grp->lck_grp_spincnt;
		lockgroup_info->lock_spin_util_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_util_cnt;
		lockgroup_info->lock_spin_held_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_cnt;
		lockgroup_info->lock_spin_miss_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_miss_cnt;
		lockgroup_info->lock_spin_held_max = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_max;
		lockgroup_info->lock_spin_held_cum = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_cum;

		lockgroup_info->lock_mtx_cnt = lck_grp->lck_grp_mtxcnt;
		lockgroup_info->lock_mtx_util_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_util_cnt;
		lockgroup_info->lock_mtx_held_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cnt;
		lockgroup_info->lock_mtx_miss_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_miss_cnt;
		lockgroup_info->lock_mtx_wait_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cnt;
		lockgroup_info->lock_mtx_held_max = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_max;
		lockgroup_info->lock_mtx_held_cum = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cum;
		lockgroup_info->lock_mtx_wait_max = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_max;
		lockgroup_info->lock_mtx_wait_cum = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cum;

		lockgroup_info->lock_rw_cnt = lck_grp->lck_grp_rwcnt;
		lockgroup_info->lock_rw_util_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_util_cnt;
		lockgroup_info->lock_rw_held_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_cnt;
		lockgroup_info->lock_rw_miss_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt;
		lockgroup_info->lock_rw_wait_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cnt;
		lockgroup_info->lock_rw_held_max = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_max;
		lockgroup_info->lock_rw_held_cum = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_cum;
		lockgroup_info->lock_rw_wait_max = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_max;
		lockgroup_info->lock_rw_wait_cum = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cum;

		/*
		 * NOTE(review): strncpy truncates to LOCKGROUP_MAX_NAME and may
		 * leave the name unterminated if it is exactly that long —
		 * presumably consumers treat it as a fixed-width field; verify.
		 */
		(void) strncpy(lockgroup_info->lockgroup_name,lck_grp->lck_grp_name, LOCKGROUP_MAX_NAME);

		lck_grp = (lck_grp_t *)(queue_next((queue_entry_t)(lck_grp)));
		lockgroup_info++;
	}

	*lockgroup_infoCntp = lck_grp_cnt;
	lck_mtx_unlock(&lck_grp_lock);

	/*
	 * Zero the unused tail of the last page (lockgroup_info now points
	 * one past the final record) so no stray data is exported.
	 */
	if (lockgroup_info_size != lockgroup_info_vmsize)
		bzero((char *)lockgroup_info, lockgroup_info_vmsize - lockgroup_info_size);

	/* Wrap the buffer for out-of-line transfer to the caller. */
	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)lockgroup_info_addr,
			   (vm_map_size_t)lockgroup_info_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*lockgroup_infop = (lockgroup_info_t *) copy;

	return(KERN_SUCCESS);
}
1533
1534 /*
1535 * Atomic primitives, prototyped in kern/simple_lock.h
1536 * Noret versions are more efficient on some architectures
1537 */
1538
1539 uint32_t
1540 hw_atomic_add(volatile uint32_t *dest, uint32_t delt)
1541 {
1542 ALIGN_TEST(dest,uint32_t);
1543 return __c11_atomic_fetch_add(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) + delt;
1544 }
1545
1546 uint32_t
1547 hw_atomic_sub(volatile uint32_t *dest, uint32_t delt)
1548 {
1549 ALIGN_TEST(dest,uint32_t);
1550 return __c11_atomic_fetch_sub(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) - delt;
1551 }
1552
1553 uint32_t
1554 hw_atomic_or(volatile uint32_t *dest, uint32_t mask)
1555 {
1556 ALIGN_TEST(dest,uint32_t);
1557 return __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) | mask;
1558 }
1559
/*
 * Atomically OR mask into *dest (relaxed ordering).  No return value;
 * per the note above, the noret form is more efficient on some
 * architectures.
 */
void
hw_atomic_or_noret(volatile uint32_t *dest, uint32_t mask)
{
	ALIGN_TEST(dest,uint32_t);
	__c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed);
}
1566
1567 uint32_t
1568 hw_atomic_and(volatile uint32_t *dest, uint32_t mask)
1569 {
1570 ALIGN_TEST(dest,uint32_t);
1571 return __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) & mask;
1572 }
1573
/*
 * Atomically AND mask into *dest (relaxed ordering).  No return value;
 * per the note above, the noret form is more efficient on some
 * architectures.
 */
void
hw_atomic_and_noret(volatile uint32_t *dest, uint32_t mask)
{
	ALIGN_TEST(dest,uint32_t);
	__c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed);
}
1580
1581 uint32_t
1582 hw_compare_and_store(uint32_t oldval, uint32_t newval, volatile uint32_t *dest)
1583 {
1584 ALIGN_TEST(dest,uint32_t);
1585 return __c11_atomic_compare_exchange_strong(ATOMIC_CAST(uint32_t,dest), &oldval, newval,
1586 memory_order_acq_rel_smp, memory_order_relaxed);
1587 }
1588