diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c
index 4a498b21447ae40d82b456bd834039e4f79b4afe..25641b8beb409ef593dec5205996e71db2ae7f05 100644
--- a/osfmk/kern/locks.c
+++ b/osfmk/kern/locks.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  * any improvements or extensions that they make and grant Carnegie Mellon
  * the rights to redistribute these changes.
  */
+
+#define ATOMIC_PRIVATE 1
+#define LOCK_PRIVATE 1
+
 #include <mach_ldebug.h>
 #include <debug.h>
 
@@ -67,6 +71,8 @@
 #include <kern/processor.h>
 #include <kern/sched_prim.h>
 #include <kern/debug.h>
+#include <machine/atomic.h>
+#include <machine/machine_cpu.h>
 #include <string.h>
 
 
 #define        LCK_MTX_LCK_WAIT_CODE           2
 #define        LCK_MTX_UNLCK_WAKEUP_CODE       3
 
+#if MACH_LDEBUG
+#define ALIGN_TEST(p,t) do{if((uintptr_t)p&(sizeof(t)-1)) __builtin_trap();}while(0)
+#else
+#define ALIGN_TEST(p,t) do{}while(0)
+#endif
+
+/* Silence the volatile to _Atomic cast warning */
+#define ATOMIC_CAST(t,p) ((_Atomic t*)(uintptr_t)(p))
+
+/* Enforce program order of loads and stores. */
+#define ordered_load(target, type) \
+               __c11_atomic_load((_Atomic type *)(target), memory_order_relaxed)
+#define ordered_store(target, type, value) \
+               __c11_atomic_store((_Atomic type *)(target), value, memory_order_relaxed)
+
+#define ordered_load_hw(lock)                  ordered_load(&(lock)->lock_data, uintptr_t)
+#define ordered_store_hw(lock, value)  ordered_store(&(lock)->lock_data, uintptr_t, (value))
+
+#define NOINLINE               __attribute__((noinline))
+
+
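
A minimal user-space sketch of what the ordered_load/ordered_store and ATOMIC_CAST macros above accomplish, assuming C11 <stdatomic.h>: accesses to the lock word go through relaxed atomics so the compiler can neither tear nor reorder them, and the uintptr_t round-trip strips the volatile qualifier without a cast warning. demo_lock_t and the demo_* helpers are hypothetical stand-ins, not kernel API.

#include <stdatomic.h>
#include <stdint.h>

typedef struct {
	volatile uintptr_t lock_data;   /* stand-in for the hw_lock_t lock word */
} demo_lock_t;

/* Same trick as ATOMIC_CAST: launder the volatile pointer into an _Atomic one. */
#define DEMO_ATOMIC_CAST(t, p)  ((_Atomic t *)(uintptr_t)(p))

static inline uintptr_t
demo_ordered_load(demo_lock_t *lock)                    /* cf. ordered_load_hw() */
{
	return atomic_load_explicit(DEMO_ATOMIC_CAST(uintptr_t, &lock->lock_data),
	    memory_order_relaxed);
}

static inline void
demo_ordered_store(demo_lock_t *lock, uintptr_t value)  /* cf. ordered_store_hw() */
{
	atomic_store_explicit(DEMO_ATOMIC_CAST(uintptr_t, &lock->lock_data),
	    value, memory_order_relaxed);
}
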
 static queue_head_t    lck_grp_queue;
 static unsigned int    lck_grp_cnt;
 
@@ -97,6 +124,14 @@ lck_grp_attr_t      LockDefaultGroupAttr;
 lck_grp_t              LockCompatGroup;
 lck_attr_t             LockDefaultLckAttr;
 
+#if CONFIG_DTRACE && __SMP__
+#if defined (__x86_64__)
+uint64_t dtrace_spin_threshold = 500; // 500ns
+#elif defined(__arm__) || defined(__arm64__)
+uint64_t dtrace_spin_threshold = LOCK_PANIC_TIMEOUT / 1000000; // 500ns
+#endif
+#endif
+
 /*
  * Routine:    lck_mod_init
  */
@@ -111,6 +146,12 @@ lck_mod_init(
        if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts)))
                LcksOpts = 0;
 
+
+#if (DEVELOPMENT || DEBUG) && defined(__x86_64__)
+       if (!PE_parse_boot_argn("-disable_mtx_chk", &LckDisablePreemptCheck, sizeof (LckDisablePreemptCheck)))
+               LckDisablePreemptCheck = 0;
+#endif /* (DEVELOPMENT || DEBUG) && defined(__x86_64__) */
+
        queue_init(&lck_grp_queue);
        
        /* 
@@ -135,7 +176,6 @@ lck_mod_init(
        lck_attr_setdefault(&LockDefaultLckAttr);
        
        lck_mtx_init_ext(&lck_grp_lock, &lck_grp_lock_ext, &LockCompatGroup, &LockDefaultLckAttr);
-       
 }
 
 /*
@@ -218,6 +258,9 @@ lck_grp_alloc_init(
 void
 lck_grp_init(lck_grp_t * grp, const char * grp_name, lck_grp_attr_t * attr)
 {
+       /* make sure locking infrastructure has been initialized */
+       assert(lck_grp_cnt > 0);
+
        bzero((void *)grp, sizeof(lck_grp_t));
 
        (void)strlcpy(grp->lck_grp_name, grp_name, LCK_GRP_MAX_NAME);
@@ -315,6 +358,7 @@ lck_grp_lckcnt_decr(
        lck_type_t      lck_type)
 {
        unsigned int    *lckcnt;
+       int             updated;
 
        switch (lck_type) {
        case LCK_TYPE_SPIN:
@@ -327,10 +371,12 @@ lck_grp_lckcnt_decr(
                lckcnt = &grp->lck_grp_rwcnt;
                break;
        default:
-               return panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type);
+               panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type);
+               return;
        }
 
-       (void)hw_atomic_sub(lckcnt, 1);
+       updated = (int)hw_atomic_sub(lckcnt, 1);
+       assert(updated >= 0);
 }
 
 /*
@@ -358,7 +404,10 @@ void
 lck_attr_setdefault(
        lck_attr_t      *attr)
 {
-#if   __i386__ || __x86_64__
+#if __arm__ || __arm64__
+       /* <rdar://problem/4404579>: Using LCK_ATTR_DEBUG here causes panic at boot time for arm */
+       attr->lck_attr_val =  LCK_ATTR_NONE;
+#elif __i386__ || __x86_64__
 #if     !DEBUG
        if (LcksOpts & enaLkDeb)
                attr->lck_attr_val =  LCK_ATTR_DEBUG;
@@ -415,6 +464,246 @@ lck_attr_free(
        kfree(attr, sizeof(lck_attr_t));
 }
 
+/*
+ * Routine:    hw_lock_init
+ *
+ *     Initialize a hardware lock.
+ */
+void
+hw_lock_init(hw_lock_t lock)
+{
+       ordered_store_hw(lock, 0);
+}
+
+/*
+ *     Routine: hw_lock_lock_contended
+ *
+ *     Spin until lock is acquired or timeout expires.
+ *     timeout is in mach_absolute_time ticks. Called with
+ *     preemption disabled.
+ */
+
+#if    __SMP__
+static unsigned int NOINLINE
+hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic)
+{
+       uint64_t        end = 0;
+       uintptr_t       holder = lock->lock_data;
+       int             i;
+
+       if (timeout == 0)
+               timeout = LOCK_PANIC_TIMEOUT;
+#if CONFIG_DTRACE
+       uint64_t begin;
+       boolean_t dtrace_enabled = lockstat_probemap[LS_LCK_SPIN_LOCK_SPIN] != 0;
+       if (__improbable(dtrace_enabled))
+               begin = mach_absolute_time();
+#endif
+       for ( ; ; ) {   
+               for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
+                       cpu_pause();
+#if (!__ARM_ENABLE_WFE_) || (LOCK_PRETEST)
+                       holder = ordered_load_hw(lock);
+                       if (holder != 0)
+                               continue;
+#endif
+                       if (atomic_compare_exchange(&lock->lock_data, 0, data,
+                           memory_order_acquire_smp, TRUE)) {
+#if CONFIG_DTRACE
+                               if (__improbable(dtrace_enabled)) {
+                                       uint64_t spintime = mach_absolute_time() - begin;
+                                       if (spintime > dtrace_spin_threshold)
+                                               LOCKSTAT_RECORD2(LS_LCK_SPIN_LOCK_SPIN, lock, spintime, dtrace_spin_threshold);
+                               }
+#endif
+                               return 1;
+                       }
+               }
+               if (end == 0) {
+                       end = ml_get_timebase() + timeout;
+               }
+               else if (ml_get_timebase() >= end)
+                       break;
+       }
+       if (do_panic) {
+               // Capture the actual time spent blocked, which may be higher than the timeout
+               // if a misbehaving interrupt stole this thread's CPU time.
+               panic("Spinlock timeout after %llu ticks, %p = %lx",
+                       (ml_get_timebase() - end + timeout), lock, holder);
+       }
+       return 0;
+}
+#endif // __SMP__
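
A rough portable analogue of the contended path above, assuming C11 atomics and a hypothetical now_ns() monotonic clock: an inner snoop loop of relaxed loads keeps the cache line shared until the lock looks free, and only then is the acquire compare-and-swap attempted; an outer deadline bounds the spin. The cpu_pause()/WFE hinting, lazy deadline arming, DTrace probes, and panic path are omitted.

#include <stdatomic.h>
#include <stdint.h>
#include <time.h>

#define DEMO_SNOOP_SPINS 0x400

static uint64_t
now_ns(void)    /* hypothetical timebase; the kernel uses ml_get_timebase() */
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* Spin until the lock is taken or the deadline passes; returns 1 on success. */
static int
demo_lock_contended(_Atomic uintptr_t *lock, uintptr_t self, uint64_t timeout_ns)
{
	uint64_t deadline = now_ns() + timeout_ns;

	for (;;) {
		for (int i = 0; i < DEMO_SNOOP_SPINS; i++) {
			/* Cheap relaxed read first: don't hammer the line with CAS. */
			if (atomic_load_explicit(lock, memory_order_relaxed) != 0)
				continue;
			uintptr_t expected = 0;
			if (atomic_compare_exchange_strong_explicit(lock, &expected,
			    self, memory_order_acquire, memory_order_relaxed))
				return 1;
		}
		if (now_ns() >= deadline)
			return 0;       /* caller decides whether to panic */
	}
}
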
+
+/*
+ *     Routine: hw_lock_lock
+ *
+ *     Acquire lock, spinning until it becomes available,
+ *     return with preemption disabled.
+ */
+void
+hw_lock_lock(hw_lock_t lock)
+{
+       thread_t        thread;
+       uintptr_t       state;
+
+       thread = current_thread();
+       disable_preemption_for_thread(thread);
+       state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
+#if    __SMP__
+
+#if    LOCK_PRETEST
+       if (ordered_load_hw(lock))
+               goto contended;
+#endif // LOCK_PRETEST
+       if (atomic_compare_exchange(&lock->lock_data, 0, state,
+                                       memory_order_acquire_smp, TRUE)) {
+               goto end;
+       }
+#if    LOCK_PRETEST
+contended:
+#endif // LOCK_PRETEST
+       hw_lock_lock_contended(lock, state, 0, TRUE);
+end:
+#else  // __SMP__
+       if (lock->lock_data)
+               panic("Spinlock held %p", lock);
+       lock->lock_data = state;
+#endif // __SMP__
+#if CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
+#endif
+       return;
+}
+
+/*
+ *     Routine: hw_lock_to
+ *
+ *     Acquire lock, spinning until it becomes available or timeout.
+ *     Timeout is in mach_absolute_time ticks, return with
+ *     preemption disabled.
+ */
+unsigned int
+hw_lock_to(hw_lock_t lock, uint64_t timeout)
+{
+       thread_t        thread;
+       uintptr_t       state;
+       unsigned int success = 0;
+
+       thread = current_thread();
+       disable_preemption_for_thread(thread);
+       state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
+#if    __SMP__
+
+#if    LOCK_PRETEST
+       if (ordered_load_hw(lock))
+               goto contended;
+#endif // LOCK_PRETEST
+       if (atomic_compare_exchange(&lock->lock_data, 0, state,
+                                       memory_order_acquire_smp, TRUE)) {
+               success = 1;
+               goto end;
+       }
+#if    LOCK_PRETEST
+contended:
+#endif // LOCK_PRETEST
+       success = hw_lock_lock_contended(lock, state, timeout, FALSE);
+end:
+#else  // __SMP__
+       (void)timeout;
+       if (ordered_load_hw(lock) == 0) {
+               ordered_store_hw(lock, state);
+               success = 1;
+       }
+#endif // __SMP__
+#if CONFIG_DTRACE
+       if (success)
+               LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
+#endif
+       return success;
+}
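
Usage sketch for hw_lock_to() with a hypothetical my_lock: the timeout is in timebase ticks (the usual conversion helper is nanoseconds_to_absolutetime()), and note from the code above that on timeout the lock is not held but preemption is still disabled, so the caller must drop it.

	uint64_t timeout_ticks;

	nanoseconds_to_absolutetime(100 * NSEC_PER_USEC, &timeout_ticks);

	if (hw_lock_to(&my_lock, timeout_ticks)) {
		/* ... short critical section, preemption disabled ... */
		hw_lock_unlock(&my_lock);       /* releases lock, re-enables preemption */
	} else {
		/* Timed out: lock NOT held, but preemption is still disabled here. */
		enable_preemption();
	}
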
+
+/*
+ *     Routine: hw_lock_try
+ *
+ *     returns with preemption disabled on success.
+ */
+unsigned int
+hw_lock_try(hw_lock_t lock)
+{
+       thread_t        thread = current_thread();
+       int             success = 0;
+#if    LOCK_TRY_DISABLE_INT
+       long            intmask;
+
+       intmask = disable_interrupts();
+#else
+       disable_preemption_for_thread(thread);
+#endif // LOCK_TRY_DISABLE_INT
+
+#if    __SMP__
+#if    LOCK_PRETEST
+       if (ordered_load_hw(lock))
+               goto failed;
+#endif // LOCK_PRETEST
+       success = atomic_compare_exchange(&lock->lock_data, 0, LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK,
+                                       memory_order_acquire_smp, FALSE);
+#else
+       if (lock->lock_data == 0) {
+               lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
+               success = 1;
+       }
+#endif // __SMP__
+
+#if    LOCK_TRY_DISABLE_INT
+       if (success)
+               disable_preemption_for_thread(thread);
+#if    LOCK_PRETEST
+failed:
+#endif // LOCK_PRETEST
+       restore_interrupts(intmask);
+#else
+#if    LOCK_PRETEST
+failed:
+#endif // LOCK_PRETEST
+       if (!success)
+               enable_preemption();
+#endif // LOCK_TRY_DISABLE_INT
+#if CONFIG_DTRACE
+       if (success)
+               LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
+#endif
+       return success;
+}
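
Usage sketch for hw_lock_try() with a hypothetical my_lock: unlike hw_lock_to(), the failure path above has already re-enabled preemption (in the default !LOCK_TRY_DISABLE_INT build), so the caller simply falls back.

	if (hw_lock_try(&my_lock)) {
		/* Lock held, preemption disabled. */
		/* ... short critical section ... */
		hw_lock_unlock(&my_lock);       /* drops lock, re-enables preemption */
	} else {
		/* Not acquired; preemption already restored by hw_lock_try(). */
	}
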
+
+/*
+ *     Routine: hw_lock_unlock
+ *
+ *     Unconditionally release lock, release preemption level.
+ */
+void
+hw_lock_unlock(hw_lock_t lock)
+{
+       __c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp);
+#if __arm__ || __arm64__
+       // ARM tests are only for open-source exclusion
+       set_event();
+#endif // __arm__ || __arm64__
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, 0);
+#endif /* CONFIG_DTRACE */
+       enable_preemption();
+}
+
+/*
+ *     Routine hw_lock_held, doesn't change preemption state.
+ *     N.B.  Racy, of course.
+ */
+unsigned int
+hw_lock_held(hw_lock_t lock)
+{
+       return (ordered_load_hw(lock) != 0);
+}
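
For completeness, a portable-C11 analogue of the release in hw_lock_unlock() that pairs with the acquire CAS in the lock paths; the ARM set_event() wakeup and the preemption bookkeeping have no portable equivalent and are omitted.

#include <stdatomic.h>
#include <stdint.h>

static inline void
demo_lock_unlock(_Atomic uintptr_t *lock)
{
	/* Release store: makes the critical section's writes visible before
	 * the lock word is observed as free by an acquiring CAS. */
	atomic_store_explicit(lock, 0, memory_order_release);
}
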
 
 /*
  * Routine:    lck_spin_sleep
@@ -546,6 +835,8 @@ lck_mtx_sleep(
                if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
                        if ((lck_sleep_action & LCK_SLEEP_SPIN))
                                lck_mtx_lock_spin(lck);
+                       else if ((lck_sleep_action & LCK_SLEEP_SPIN_ALWAYS))
+                               lck_mtx_lock_spin_always(lck);
                        else
                                lck_mtx_lock(lck);
                }
@@ -665,9 +956,11 @@ lck_mtx_lock_wait (
        priority = MIN(priority, MAXPRI_PROMOTE);
 
        thread_lock(holder);
-       if (mutex->lck_mtx_pri == 0)
+       if (mutex->lck_mtx_pri == 0) {
                holder->promotions++;
-       holder->sched_flags |= TH_SFLAG_PROMOTED;
+               holder->sched_flags |= TH_SFLAG_PROMOTED;
+       }
+
        if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) {
                KERNEL_DEBUG_CONSTANT(
                        MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
@@ -689,6 +982,7 @@ lck_mtx_lock_wait (
                mutex->lck_mtx_waiters++;
        }
 
+       thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
        assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
        lck_mtx_ilk_unlock(mutex);
 
@@ -697,7 +991,7 @@ lck_mtx_lock_wait (
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
 #if    CONFIG_DTRACE
        /*
-        * Record the Dtrace lockstat probe for blocking, block time
+        * Record the DTrace lockstat probe for blocking, block time
         * measured from when we were entered.
         */
        if (sleep_start) {
@@ -1103,13 +1397,13 @@ void lck_rw_clear_promotion(thread_t thread)
                        /* Thread still has a mutex promotion */
                } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
                        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
-                                                             thread->sched_pri, DEPRESSPRI, 0, 0, 0);
-                       
+                                             (uintptr_t)thread_tid(thread), thread->sched_pri, DEPRESSPRI, 0, 0);
+
                        set_sched_pri(thread, DEPRESSPRI);
                } else {
                        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
-                                                                 thread->sched_pri, thread->base_pri, 0, 0, 0);
-                       
+                                             (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, 0, 0);
+
                        thread_recompute_sched_pri(thread, FALSE);
                }
        }
@@ -1118,6 +1412,41 @@ void lck_rw_clear_promotion(thread_t thread)
        splx(s);
 }
 
+/*
+ * Callout from context switch if the thread goes
+ * off core with a positive rwlock_count
+ *
+ * Called at splsched with the thread locked
+ */
+void
+lck_rw_set_promotion_locked(thread_t thread)
+{
+       if (LcksOpts & disLkRWPrio)
+               return;
+
+       integer_t priority;
+
+       priority = thread->sched_pri;
+
+       if (priority < thread->base_pri)
+               priority = thread->base_pri;
+       if (priority < BASEPRI_BACKGROUND)
+               priority = BASEPRI_BACKGROUND;
+
+       if ((thread->sched_pri < priority) ||
+           !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
+               KERNEL_DEBUG_CONSTANT(
+                       MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE,
+                       (uintptr_t)thread_tid(thread), thread->sched_pri,
+                       thread->base_pri, priority, 0);
+
+               thread->sched_flags |= TH_SFLAG_RW_PROMOTED;
+
+               if (thread->sched_pri < priority)
+                       set_sched_pri(thread, priority);
+       }
+}
+
 kern_return_t
 host_lockgroup_info(
        host_t                                  host,
@@ -1128,9 +1457,9 @@ host_lockgroup_info(
        lockgroup_info_t        *lockgroup_info;
        vm_offset_t                     lockgroup_info_addr;
        vm_size_t                       lockgroup_info_size;
+       vm_size_t                       lockgroup_info_vmsize;
        lck_grp_t                       *lck_grp;
        unsigned int            i;
-       vm_size_t                       used;
        vm_map_copy_t           copy;
        kern_return_t           kr;
 
@@ -1139,9 +1468,10 @@ host_lockgroup_info(
 
        lck_mtx_lock(&lck_grp_lock);
 
-       lockgroup_info_size = round_page(lck_grp_cnt * sizeof *lockgroup_info);
+       lockgroup_info_size = lck_grp_cnt * sizeof(*lockgroup_info);
+       lockgroup_info_vmsize = round_page(lockgroup_info_size);
        kr = kmem_alloc_pageable(ipc_kernel_map,
-                                                &lockgroup_info_addr, lockgroup_info_size, VM_KERN_MEMORY_IPC);
+                                                &lockgroup_info_addr, lockgroup_info_vmsize, VM_KERN_MEMORY_IPC);
        if (kr != KERN_SUCCESS) {
                lck_mtx_unlock(&lck_grp_lock);
                return(kr);
@@ -1189,10 +1519,8 @@ host_lockgroup_info(
        *lockgroup_infoCntp = lck_grp_cnt;
        lck_mtx_unlock(&lck_grp_lock);
 
-       used = (*lockgroup_infoCntp) * sizeof *lockgroup_info;
-
-       if (used != lockgroup_info_size)
-               bzero((char *) lockgroup_info, lockgroup_info_size - used);
+       if (lockgroup_info_size != lockgroup_info_vmsize)
+               bzero((char *)lockgroup_info, lockgroup_info_vmsize - lockgroup_info_size);
 
        kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)lockgroup_info_addr,
                           (vm_map_size_t)lockgroup_info_size, TRUE, &copy);
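
The sizing change above separates the exact payload size from the rounded-up, page-granular VM allocation and zeroes only the slack between them before the copy-out, so no stale kernel bytes leak past the live entries. A small user-space sketch of the same pattern, with hypothetical names and page size:

#include <stddef.h>
#include <string.h>

#define DEMO_PAGE_SIZE  4096u
#define demo_round_page(x) \
	(((x) + (size_t)DEMO_PAGE_SIZE - 1) & ~((size_t)DEMO_PAGE_SIZE - 1))

/* Zero the tail of a page-rounded buffer beyond the bytes actually used. */
static void
demo_zero_slack(void *buf, size_t used_size)
{
	size_t vmsize = demo_round_page(used_size);
	if (used_size != vmsize)
		memset((char *)buf + used_size, 0, vmsize - used_size);
}
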
@@ -1203,3 +1531,58 @@ host_lockgroup_info(
        return(KERN_SUCCESS);
 }
 
+/*
+ * Atomic primitives, prototyped in kern/simple_lock.h
+ * Noret versions are more efficient on some architectures
+ */
+       
+uint32_t
+hw_atomic_add(volatile uint32_t *dest, uint32_t delt)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_fetch_add(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) + delt;
+}
+
+uint32_t
+hw_atomic_sub(volatile uint32_t *dest, uint32_t delt)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_fetch_sub(ATOMIC_CAST(uint32_t,dest), delt, memory_order_relaxed) - delt;
+}
+
+uint32_t
+hw_atomic_or(volatile uint32_t *dest, uint32_t mask)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) | mask;
+}
+
+void
+hw_atomic_or_noret(volatile uint32_t *dest, uint32_t mask)
+{
+       ALIGN_TEST(dest,uint32_t);
+       __c11_atomic_fetch_or(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed);
+}
+
+uint32_t
+hw_atomic_and(volatile uint32_t *dest, uint32_t mask)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed) & mask;
+}
+
+void
+hw_atomic_and_noret(volatile uint32_t *dest, uint32_t mask)
+{
+       ALIGN_TEST(dest,uint32_t);
+       __c11_atomic_fetch_and(ATOMIC_CAST(uint32_t,dest), mask, memory_order_relaxed);
+}
+
+uint32_t
+hw_compare_and_store(uint32_t oldval, uint32_t newval, volatile uint32_t *dest)
+{
+       ALIGN_TEST(dest,uint32_t);
+       return __c11_atomic_compare_exchange_strong(ATOMIC_CAST(uint32_t,dest), &oldval, newval,
+                       memory_order_acq_rel_smp, memory_order_relaxed);
+}
+
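
Portable C11 equivalents of two of the primitives above, assuming <stdatomic.h>: hw_atomic_add() and friends return the new value, so the old value returned by the C11 fetch op has the operation applied once more, and hw_compare_and_store() returns nonzero only when the swap happened.

#include <stdatomic.h>
#include <stdint.h>

static inline uint32_t
demo_atomic_add(_Atomic uint32_t *dest, uint32_t delt)   /* cf. hw_atomic_add() */
{
	return atomic_fetch_add_explicit(dest, delt, memory_order_relaxed) + delt;
}

static inline uint32_t
demo_compare_and_store(uint32_t oldval, uint32_t newval,
    _Atomic uint32_t *dest)                        /* cf. hw_compare_and_store() */
{
	return atomic_compare_exchange_strong_explicit(dest, &oldval, newval,
	    memory_order_acq_rel, memory_order_relaxed);
}
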