X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/0c530ab8987f0ae6a1a3d9284f40182b88852816..c7d2c2c6ee645e10cbccdd01c6191873ec77239d:/osfmk/kern/locks.c diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index db2db8142..2f782cd12 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2007 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ @@ -47,7 +53,6 @@ * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ -#include #include #include @@ -67,20 +72,30 @@ #include +#if CONFIG_DTRACE +/* + * We need only enough declarations from the BSD-side to be able to + * test if our probe is active, and to call __dtrace_probe(). Setting + * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in. + */ +#define NEED_DTRACE_DEFS +#include <../bsd/sys/lockstat.h> +#endif + #define LCK_MTX_SLEEP_CODE 0 #define LCK_MTX_SLEEP_DEADLINE_CODE 1 #define LCK_MTX_LCK_WAIT_CODE 2 #define LCK_MTX_UNLCK_WAKEUP_CODE 3 - static queue_head_t lck_grp_queue; static unsigned int lck_grp_cnt; -decl_mutex_data(static,lck_grp_lock) +decl_lck_mtx_data(static,lck_grp_lock) +static lck_mtx_ext_t lck_grp_lock_ext; lck_grp_attr_t LockDefaultGroupAttr; -lck_grp_t LockCompatGroup; -lck_attr_t LockDefaultLckAttr; +lck_grp_t LockCompatGroup; +lck_attr_t LockDefaultLckAttr; /* * Routine: lck_mod_init @@ -90,12 +105,37 @@ void lck_mod_init( void) { + /* + * Obtain "lcks" options:this currently controls lock statistics + */ + if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts))) + LcksOpts = 0; + queue_init(&lck_grp_queue); - mutex_init(&lck_grp_lock, 0); - lck_grp_cnt = 0; - lck_grp_attr_setdefault( &LockDefaultGroupAttr); - lck_grp_init( &LockCompatGroup, "Compatibility APIs", LCK_GRP_ATTR_NULL); + + /* + * Need to bootstrap the LockCompatGroup instead of calling lck_grp_init() here. This avoids + * grabbing the lck_grp_lock before it is initialized. + */ + + bzero(&LockCompatGroup, sizeof(lck_grp_t)); + (void) strncpy(LockCompatGroup.lck_grp_name, "Compatibility APIs", LCK_GRP_MAX_NAME); + + if (LcksOpts & enaLkStat) + LockCompatGroup.lck_grp_attr = LCK_GRP_ATTR_STAT; + else + LockCompatGroup.lck_grp_attr = LCK_ATTR_NONE; + + LockCompatGroup.lck_grp_refcnt = 1; + + enqueue_tail(&lck_grp_queue, (queue_entry_t)&LockCompatGroup); + lck_grp_cnt = 1; + + lck_grp_attr_setdefault(&LockDefaultGroupAttr); lck_attr_setdefault(&LockDefaultLckAttr); + + lck_mtx_init_ext(&lck_grp_lock, &lck_grp_lock_ext, &LockCompatGroup, &LockDefaultLckAttr); + } /* @@ -138,7 +178,7 @@ void lck_grp_attr_setstat( lck_grp_attr_t *attr) { - (void)hw_atomic_or((uint32_t *)&attr->grp_attr_val, LCK_GRP_ATTR_STAT); + (void)hw_atomic_or(&attr->grp_attr_val, LCK_GRP_ATTR_STAT); } @@ -155,7 +195,7 @@ lck_grp_attr_free( /* - * Routine: lck_grp_alloc_init + * Routine: lck_grp_alloc_init */ lck_grp_t * @@ -171,38 +211,32 @@ lck_grp_alloc_init( return(grp); } - /* - * Routine: lck_grp_init + * Routine: lck_grp_init */ void -lck_grp_init( - lck_grp_t *grp, - const char* grp_name, - lck_grp_attr_t *attr) +lck_grp_init(lck_grp_t * grp, const char * grp_name, lck_grp_attr_t * attr) { bzero((void *)grp, sizeof(lck_grp_t)); - (void) strncpy(grp->lck_grp_name, grp_name, LCK_GRP_MAX_NAME); + (void)strlcpy(grp->lck_grp_name, grp_name, LCK_GRP_MAX_NAME); if (attr != LCK_GRP_ATTR_NULL) grp->lck_grp_attr = attr->grp_attr_val; else if (LcksOpts & enaLkStat) - grp->lck_grp_attr = LCK_GRP_ATTR_STAT; - else - grp->lck_grp_attr = LCK_ATTR_NONE; + grp->lck_grp_attr = LCK_GRP_ATTR_STAT; + else + grp->lck_grp_attr = LCK_ATTR_NONE; grp->lck_grp_refcnt = 1; - mutex_lock(&lck_grp_lock); + lck_mtx_lock(&lck_grp_lock); enqueue_tail(&lck_grp_queue, (queue_entry_t)grp); lck_grp_cnt++; - mutex_unlock(&lck_grp_lock); - + lck_mtx_unlock(&lck_grp_lock); } - /* * Routine: lck_grp_free */ @@ -211,10 +245,10 @@ void lck_grp_free( lck_grp_t *grp) { - mutex_lock(&lck_grp_lock); + lck_mtx_lock(&lck_grp_lock); lck_grp_cnt--; (void)remque((queue_entry_t)grp); - mutex_unlock(&lck_grp_lock); + lck_mtx_unlock(&lck_grp_lock); lck_grp_deallocate(grp); } @@ -227,7 +261,7 @@ void lck_grp_reference( lck_grp_t *grp) { - (void)hw_atomic_add((uint32_t *)(&grp->lck_grp_refcnt), 1); + (void)hw_atomic_add(&grp->lck_grp_refcnt, 1); } @@ -239,7 +273,7 @@ void lck_grp_deallocate( lck_grp_t *grp) { - if (hw_atomic_sub((uint32_t *)(&grp->lck_grp_refcnt), 1) == 0) + if (hw_atomic_sub(&grp->lck_grp_refcnt, 1) == 0) kfree(grp, sizeof(lck_grp_t)); } @@ -268,7 +302,7 @@ lck_grp_lckcnt_incr( return panic("lck_grp_lckcnt_incr(): invalid lock type: %d\n", lck_type); } - (void)hw_atomic_add((uint32_t *)lckcnt, 1); + (void)hw_atomic_add(lckcnt, 1); } /* @@ -296,7 +330,7 @@ lck_grp_lckcnt_decr( return panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type); } - (void)hw_atomic_sub((uint32_t *)lckcnt, 1); + (void)hw_atomic_sub(lckcnt, 1); } /* @@ -324,15 +358,18 @@ void lck_attr_setdefault( lck_attr_t *attr) { +#if __i386__ || __x86_64__ #if !DEBUG - if (LcksOpts & enaLkDeb) - attr->lck_attr_val = LCK_ATTR_DEBUG; - else - attr->lck_attr_val = LCK_ATTR_NONE; + if (LcksOpts & enaLkDeb) + attr->lck_attr_val = LCK_ATTR_DEBUG; + else + attr->lck_attr_val = LCK_ATTR_NONE; #else - attr->lck_attr_val = LCK_ATTR_DEBUG; -#endif - + attr->lck_attr_val = LCK_ATTR_DEBUG; +#endif /* !DEBUG */ +#else +#error Unknown architecture. +#endif /* __arm__ */ } @@ -343,7 +380,17 @@ void lck_attr_setdebug( lck_attr_t *attr) { - (void)hw_atomic_or((uint32_t *)&attr->lck_attr_val, LCK_ATTR_DEBUG); + (void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_DEBUG); +} + +/* + * Routine: lck_attr_setdebug + */ +void +lck_attr_cleardebug( + lck_attr_t *attr) +{ + (void)hw_atomic_and(&attr->lck_attr_val, ~LCK_ATTR_DEBUG); } @@ -354,7 +401,7 @@ void lck_attr_rw_shared_priority( lck_attr_t *attr) { - (void)hw_atomic_or((uint32_t *)&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY); + (void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY); } @@ -430,6 +477,39 @@ lck_spin_sleep_deadline( } +/* + * Routine: lck_mtx_clear_promoted + * + * Handle clearing of TH_SFLAG_PROMOTED, + * adjusting thread priority as needed. + * + * Called with thread lock held + */ +static void +lck_mtx_clear_promoted ( + thread_t thread, + __kdebug_only uintptr_t trace_lck) +{ + thread->sched_flags &= ~TH_SFLAG_PROMOTED; + + if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { + /* Thread still has a RW lock promotion */ + } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0); + set_sched_pri(thread, DEPRESSPRI); + } else { + if (thread->base_pri < thread->sched_pri) { + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, thread->base_pri, 0, trace_lck, 0); + } + thread_recompute_sched_pri(thread, FALSE); + } +} + + /* * Routine: lck_mtx_sleep */ @@ -441,24 +521,46 @@ lck_mtx_sleep( wait_interrupt_t interruptible) { wait_result_t res; + thread_t thread = current_thread(); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_START, - (int)lck, (int)lck_sleep_action, (int)event, (int)interruptible, 0); + VM_KERNEL_UNSLIDE_OR_PERM(lck), (int)lck_sleep_action, VM_KERNEL_UNSLIDE_OR_PERM(event), (int)interruptible, 0); if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) panic("Invalid lock sleep action %x\n", lck_sleep_action); + if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { + /* + * We overload the RW lock promotion to give us a priority ceiling + * during the time that this thread is asleep, so that when it + * is re-awakened (and not yet contending on the mutex), it is + * runnable at a reasonably high priority. + */ + thread->rwlock_count++; + } + res = assert_wait(event, interruptible); if (res == THREAD_WAITING) { lck_mtx_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); - if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) - lck_mtx_lock(lck); + if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) { + if ((lck_sleep_action & LCK_SLEEP_SPIN)) + lck_mtx_lock_spin(lck); + else + lck_mtx_lock(lck); + } } else if (lck_sleep_action & LCK_SLEEP_UNLOCK) lck_mtx_unlock(lck); + if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { + if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread); + } + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0); return res; @@ -477,24 +579,43 @@ lck_mtx_sleep_deadline( uint64_t deadline) { wait_result_t res; + thread_t thread = current_thread(); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_START, - (int)lck, (int)lck_sleep_action, (int)event, (int)interruptible, 0); + VM_KERNEL_UNSLIDE_OR_PERM(lck), (int)lck_sleep_action, VM_KERNEL_UNSLIDE_OR_PERM(event), (int)interruptible, 0); if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) panic("Invalid lock sleep action %x\n", lck_sleep_action); + if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { + /* + * See lck_mtx_sleep(). + */ + thread->rwlock_count++; + } + res = assert_wait_deadline(event, interruptible, deadline); if (res == THREAD_WAITING) { lck_mtx_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); - if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) - lck_mtx_lock(lck); + if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) { + if ((lck_sleep_action & LCK_SLEEP_SPIN)) + lck_mtx_lock_spin(lck); + else + lck_mtx_lock(lck); + } } else if (lck_sleep_action & LCK_SLEEP_UNLOCK) lck_mtx_unlock(lck); + if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { + if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + lck_rw_clear_promotion(thread); + } + } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0); return res; @@ -515,38 +636,43 @@ lck_mtx_lock_wait ( { thread_t self = current_thread(); lck_mtx_t *mutex; + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); + __kdebug_only uintptr_t trace_holder = VM_KERNEL_UNSLIDE_OR_PERM(holder); integer_t priority; spl_t s = splsched(); +#if CONFIG_DTRACE + uint64_t sleep_start = 0; + + if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { + sleep_start = mach_absolute_time(); + } +#endif if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; else mutex = &lck->lck_mtx_ptr->lck_mtx; - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, (int)lck, (int)holder, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, trace_lck, trace_holder, 0, 0, 0); priority = self->sched_pri; - if (priority < self->priority) - priority = self->priority; - if (priority > MINPRI_KERNEL) - priority = MINPRI_KERNEL; - else + if (priority < self->base_pri) + priority = self->base_pri; if (priority < BASEPRI_DEFAULT) priority = BASEPRI_DEFAULT; + /* Do not promote past promotion ceiling */ + priority = MIN(priority, MAXPRI_PROMOTE); + thread_lock(holder); if (mutex->lck_mtx_pri == 0) holder->promotions++; - if (holder->priority < MINPRI_KERNEL) { - holder->sched_mode |= TH_MODE_PROMOTED; - if ( mutex->lck_mtx_pri < priority && - holder->sched_pri < priority ) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, - holder->sched_pri, priority, (int)holder, (int)lck, 0); - - set_sched_pri(holder, priority); - } + holder->sched_flags |= TH_SFLAG_PROMOTED; + if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) { + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, + holder->sched_pri, priority, trace_holder, trace_lck, 0); + set_sched_pri(holder, priority); } thread_unlock(holder); splx(s); @@ -563,12 +689,27 @@ lck_mtx_lock_wait ( mutex->lck_mtx_waiters++; } - assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT); + assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT); lck_mtx_ilk_unlock(mutex); thread_block(THREAD_CONTINUE_NULL); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); +#if CONFIG_DTRACE + /* + * Record the Dtrace lockstat probe for blocking, block time + * measured from when we were entered. + */ + if (sleep_start) { + if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, lck, + mach_absolute_time() - sleep_start); + } else { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, lck, + mach_absolute_time() - sleep_start); + } + } +#endif } /* @@ -587,6 +728,9 @@ lck_mtx_lock_acquire( { thread_t thread = current_thread(); lck_mtx_t *mutex; + integer_t priority; + spl_t s; + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; @@ -600,28 +744,48 @@ lck_mtx_lock_acquire( mutex->lck_mtx_waiters--; } - if (mutex->lck_mtx_waiters > 0) { - integer_t priority = mutex->lck_mtx_pri; - spl_t s = splsched(); + if (mutex->lck_mtx_waiters) + priority = mutex->lck_mtx_pri; + else { + mutex->lck_mtx_pri = 0; + priority = 0; + } + if (priority || thread->was_promoted_on_wakeup) { + s = splsched(); thread_lock(thread); - thread->promotions++; - if (thread->priority < MINPRI_KERNEL) { - thread->sched_mode |= TH_MODE_PROMOTED; + + if (priority) { + thread->promotions++; + thread->sched_flags |= TH_SFLAG_PROMOTED; if (thread->sched_pri < priority) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, - thread->sched_pri, priority, 0, (int)lck, 0); - + thread->sched_pri, priority, 0, trace_lck, 0); + /* Do not promote past promotion ceiling */ + assert(priority <= MAXPRI_PROMOTE); set_sched_pri(thread, priority); } } + if (thread->was_promoted_on_wakeup) { + thread->was_promoted_on_wakeup = 0; + if (thread->promotions == 0) + lck_mtx_clear_promoted(thread, trace_lck); + } + thread_unlock(thread); splx(s); } - else - mutex->lck_mtx_pri = 0; +#if CONFIG_DTRACE + if (lockstat_probemap[LS_LCK_MTX_LOCK_ACQUIRE] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_ACQUIRE]) { + if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lck, 0); + } else { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, lck, 0); + } + } +#endif return (mutex->lck_mtx_waiters); } @@ -639,71 +803,121 @@ lck_mtx_unlock_wakeup ( { thread_t thread = current_thread(); lck_mtx_t *mutex; + __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; else mutex = &lck->lck_mtx_ptr->lck_mtx; + if (thread != holder) + panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, (int)lck, (int)holder, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder), 0, 0, 0); - if (thread != holder) - panic("lck_mtx_unlock_wakeup: mutex %x holder %x\n", mutex, holder); + assert(mutex->lck_mtx_waiters > 0); + if (mutex->lck_mtx_waiters > 1) + thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri); + else + thread_wakeup_one(LCK_MTX_EVENT(lck)); if (thread->promotions > 0) { spl_t s = splsched(); thread_lock(thread); - if ( --thread->promotions == 0 && - (thread->sched_mode & TH_MODE_PROMOTED) ) { - thread->sched_mode &= ~TH_MODE_PROMOTED; - if (thread->sched_mode & TH_MODE_ISDEPRESSED) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, DEPRESSPRI, 0, (int)lck, 0); - - set_sched_pri(thread, DEPRESSPRI); - } - else { - if (thread->priority < thread->sched_pri) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | - DBG_FUNC_NONE, - thread->sched_pri, thread->priority, - 0, (int)lck, 0); - } - - compute_priority(thread, FALSE); - } - } + if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) + lck_mtx_clear_promoted(thread, trace_lck); thread_unlock(thread); splx(s); } - assert(mutex->lck_mtx_waiters > 0); - thread_wakeup_one((event_t)(((unsigned int*)lck)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); } +void +lck_mtx_unlockspin_wakeup ( + lck_mtx_t *lck) +{ + assert(lck->lck_mtx_waiters > 0); + thread_wakeup_one(LCK_MTX_EVENT(lck)); + + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(lck), 0, 0, 1, 0); +#if CONFIG_DTRACE + /* + * When there are waiters, we skip the hot-patch spot in the + * fastpath, so we record it here. + */ + LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0); +#endif +} + + /* * Routine: mutex_pause * * Called by former callers of simple_lock_pause(). */ +#define MAX_COLLISION_COUNTS 32 +#define MAX_COLLISION 8 + +unsigned int max_collision_count[MAX_COLLISION_COUNTS]; + +uint32_t collision_backoffs[MAX_COLLISION] = { + 10, 50, 100, 200, 400, 600, 800, 1000 +}; + void -mutex_pause(void) +mutex_pause(uint32_t collisions) { wait_result_t wait_result; + uint32_t back_off; + + if (collisions >= MAX_COLLISION_COUNTS) + collisions = MAX_COLLISION_COUNTS - 1; + max_collision_count[collisions]++; + + if (collisions >= MAX_COLLISION) + collisions = MAX_COLLISION - 1; + back_off = collision_backoffs[collisions]; - wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, 1, 1000*NSEC_PER_USEC); + wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC); assert(wait_result == THREAD_WAITING); wait_result = thread_block(THREAD_CONTINUE_NULL); assert(wait_result == THREAD_TIMED_OUT); } + +unsigned int mutex_yield_wait = 0; +unsigned int mutex_yield_no_wait = 0; + +void +lck_mtx_yield( + lck_mtx_t *lck) +{ + int waiters; + +#if DEBUG + lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED); +#endif /* DEBUG */ + + if (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT) + waiters = lck->lck_mtx_ptr->lck_mtx.lck_mtx_waiters; + else + waiters = lck->lck_mtx_waiters; + + if ( !waiters) { + mutex_yield_no_wait++; + } else { + mutex_yield_wait++; + lck_mtx_unlock(lck); + mutex_pause(0); + lck_mtx_lock(lck); + } +} + + /* * Routine: lck_rw_sleep */ @@ -716,10 +930,23 @@ lck_rw_sleep( { wait_result_t res; lck_rw_type_t lck_rw_type; - + thread_t thread = current_thread(); + if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) panic("Invalid lock sleep action %x\n", lck_sleep_action); + if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { + /* + * Although we are dropping the RW lock, the intent in most cases + * is that this thread remains as an observer, since it may hold + * some secondary resource, but must yield to avoid deadlock. In + * this situation, make sure that the thread is boosted to the + * RW lock ceiling while blocked, so that it can re-acquire the + * RW lock at that priority. + */ + thread->rwlock_count++; + } + res = assert_wait(event, interruptible); if (res == THREAD_WAITING) { lck_rw_type = lck_rw_done(lck); @@ -737,6 +964,17 @@ lck_rw_sleep( if (lck_sleep_action & LCK_SLEEP_UNLOCK) (void)lck_rw_done(lck); + if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { + if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + + /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */ + assert(lck_sleep_action & LCK_SLEEP_UNLOCK); + + lck_rw_clear_promotion(thread); + } + } + return res; } @@ -754,10 +992,15 @@ lck_rw_sleep_deadline( { wait_result_t res; lck_rw_type_t lck_rw_type; + thread_t thread = current_thread(); if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) panic("Invalid lock sleep action %x\n", lck_sleep_action); + if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { + thread->rwlock_count++; + } + res = assert_wait_deadline(event, interruptible, deadline); if (res == THREAD_WAITING) { lck_rw_type = lck_rw_done(lck); @@ -775,9 +1018,106 @@ lck_rw_sleep_deadline( if (lck_sleep_action & LCK_SLEEP_UNLOCK) (void)lck_rw_done(lck); + if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { + if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { + /* sched_flags checked without lock, but will be rechecked while clearing */ + + /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */ + assert(lck_sleep_action & LCK_SLEEP_UNLOCK); + + lck_rw_clear_promotion(thread); + } + } + return res; } +/* + * Reader-writer lock promotion + * + * We support a limited form of reader-writer + * lock promotion whose effects are: + * + * * Qualifying threads have decay disabled + * * Scheduler priority is reset to a floor of + * of their statically assigned priority + * or BASEPRI_BACKGROUND + * + * The rationale is that lck_rw_ts do not have + * a single owner, so we cannot apply a directed + * priority boost from all waiting threads + * to all holding threads without maintaining + * lists of all shared owners and all waiting + * threads for every lock. + * + * Instead (and to preserve the uncontended fast- + * path), acquiring (or attempting to acquire) + * a RW lock in shared or exclusive lock increments + * a per-thread counter. Only if that thread stops + * making forward progress (for instance blocking + * on a mutex, or being preempted) do we consult + * the counter and apply the priority floor. + * When the thread becomes runnable again (or in + * the case of preemption it never stopped being + * runnable), it has the priority boost and should + * be in a good position to run on the CPU and + * release all RW locks (at which point the priority + * boost is cleared). + * + * Care must be taken to ensure that priority + * boosts are not retained indefinitely, since unlike + * mutex priority boosts (where the boost is tied + * to the mutex lifecycle), the boost is tied + * to the thread and independent of any particular + * lck_rw_t. Assertions are in place on return + * to userspace so that the boost is not held + * indefinitely. + * + * The routines that increment/decrement the + * per-thread counter should err on the side of + * incrementing any time a preemption is possible + * and the lock would be visible to the rest of the + * system as held (so it should be incremented before + * interlocks are dropped/preemption is enabled, or + * before a CAS is executed to acquire the lock). + * + */ + +/* + * lck_rw_clear_promotion: Undo priority promotions when the last RW + * lock is released by a thread (if a promotion was active) + */ +void lck_rw_clear_promotion(thread_t thread) +{ + assert(thread->rwlock_count == 0); + + /* Cancel any promotions if the thread had actually blocked while holding a RW lock */ + spl_t s = splsched(); + + thread_lock(thread); + + if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { + thread->sched_flags &= ~TH_SFLAG_RW_PROMOTED; + + if (thread->sched_flags & TH_SFLAG_PROMOTED) { + /* Thread still has a mutex promotion */ + } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, DEPRESSPRI, 0, 0, 0); + + set_sched_pri(thread, DEPRESSPRI); + } else { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE, + thread->sched_pri, thread->base_pri, 0, 0, 0); + + thread_recompute_sched_pri(thread, FALSE); + } + } + + thread_unlock(thread); + splx(s); +} + kern_return_t host_lockgroup_info( host_t host, @@ -788,22 +1128,23 @@ host_lockgroup_info( lockgroup_info_t *lockgroup_info; vm_offset_t lockgroup_info_addr; vm_size_t lockgroup_info_size; + vm_size_t lockgroup_info_vmsize; lck_grp_t *lck_grp; unsigned int i; - vm_size_t used; vm_map_copy_t copy; kern_return_t kr; if (host == HOST_NULL) return KERN_INVALID_HOST; - mutex_lock(&lck_grp_lock); + lck_mtx_lock(&lck_grp_lock); - lockgroup_info_size = round_page(lck_grp_cnt * sizeof *lockgroup_info); + lockgroup_info_size = lck_grp_cnt * sizeof(*lockgroup_info); + lockgroup_info_vmsize = round_page(lockgroup_info_size); kr = kmem_alloc_pageable(ipc_kernel_map, - &lockgroup_info_addr, lockgroup_info_size); + &lockgroup_info_addr, lockgroup_info_vmsize, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { - mutex_unlock(&lck_grp_lock); + lck_mtx_unlock(&lck_grp_lock); return(kr); } @@ -847,12 +1188,10 @@ host_lockgroup_info( } *lockgroup_infoCntp = lck_grp_cnt; - mutex_unlock(&lck_grp_lock); + lck_mtx_unlock(&lck_grp_lock); - used = (*lockgroup_infoCntp) * sizeof *lockgroup_info; - - if (used != lockgroup_info_size) - bzero((char *) lockgroup_info, lockgroup_info_size - used); + if (lockgroup_info_size != lockgroup_info_vmsize) + bzero((char *)lockgroup_info, lockgroup_info_vmsize - lockgroup_info_size); kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)lockgroup_info_addr, (vm_map_size_t)lockgroup_info_size, TRUE, ©); @@ -863,204 +1202,3 @@ host_lockgroup_info( return(KERN_SUCCESS); } -/* - * Compatibility module - */ - -extern lck_rw_t *lock_alloc_EXT( boolean_t can_sleep, unsigned short tag0, unsigned short tag1); -extern void lock_done_EXT(lck_rw_t *lock); -extern void lock_free_EXT(lck_rw_t *lock); -extern void lock_init_EXT(lck_rw_t *lock, boolean_t can_sleep, unsigned short tag0, unsigned short tag1); -extern void lock_read_EXT(lck_rw_t *lock); -extern boolean_t lock_read_to_write_EXT(lck_rw_t *lock); -extern void lock_write_EXT(lck_rw_t *lock); -extern void lock_write_to_read_EXT(lck_rw_t *lock); -extern wait_result_t thread_sleep_lock_write_EXT( - event_t event, lck_rw_t *lock, wait_interrupt_t interruptible); - -extern lck_mtx_t *mutex_alloc_EXT(unsigned short tag); -extern void mutex_free_EXT(lck_mtx_t *mutex); -extern void mutex_init_EXT(lck_mtx_t *mutex, unsigned short tag); -extern void mutex_lock_EXT(lck_mtx_t *mutex); -extern boolean_t mutex_try_EXT(lck_mtx_t *mutex); -extern void mutex_unlock_EXT(lck_mtx_t *mutex); -extern wait_result_t thread_sleep_mutex_EXT( - event_t event, lck_mtx_t *mutex, wait_interrupt_t interruptible); -extern wait_result_t thread_sleep_mutex_deadline_EXT( - event_t event, lck_mtx_t *mutex, uint64_t deadline, wait_interrupt_t interruptible); - -extern void usimple_lock_EXT(lck_spin_t *lock); -extern void usimple_lock_init_EXT(lck_spin_t *lock, unsigned short tag); -extern unsigned int usimple_lock_try_EXT(lck_spin_t *lock); -extern void usimple_unlock_EXT(lck_spin_t *lock); -extern wait_result_t thread_sleep_usimple_lock_EXT(event_t event, lck_spin_t *lock, wait_interrupt_t interruptible); - -lck_rw_t * -lock_alloc_EXT( - __unused boolean_t can_sleep, - __unused unsigned short tag0, - __unused unsigned short tag1) -{ - return( lck_rw_alloc_init( &LockCompatGroup, LCK_ATTR_NULL)); -} - -void -lock_done_EXT( - lck_rw_t *lock) -{ - (void) lck_rw_done(lock); -} - -void -lock_free_EXT( - lck_rw_t *lock) -{ - lck_rw_free(lock, &LockCompatGroup); -} - -void -lock_init_EXT( - lck_rw_t *lock, - __unused boolean_t can_sleep, - __unused unsigned short tag0, - __unused unsigned short tag1) -{ - lck_rw_init(lock, &LockCompatGroup, LCK_ATTR_NULL); -} - -void -lock_read_EXT( - lck_rw_t *lock) -{ - lck_rw_lock_shared( lock); -} - -boolean_t -lock_read_to_write_EXT( - lck_rw_t *lock) -{ - return( lck_rw_lock_shared_to_exclusive(lock)); -} - -void -lock_write_EXT( - lck_rw_t *lock) -{ - lck_rw_lock_exclusive(lock); -} - -void -lock_write_to_read_EXT( - lck_rw_t *lock) -{ - lck_rw_lock_exclusive_to_shared(lock); -} - -wait_result_t -thread_sleep_lock_write_EXT( - event_t event, - lck_rw_t *lock, - wait_interrupt_t interruptible) -{ - return( lck_rw_sleep(lock, LCK_SLEEP_EXCLUSIVE, event, interruptible)); -} - -lck_mtx_t * -mutex_alloc_EXT( - __unused unsigned short tag) -{ - return(lck_mtx_alloc_init(&LockCompatGroup, LCK_ATTR_NULL)); -} - -void -mutex_free_EXT( - lck_mtx_t *mutex) -{ - lck_mtx_free(mutex, &LockCompatGroup); -} - -void -mutex_init_EXT( - lck_mtx_t *mutex, - __unused unsigned short tag) -{ - lck_mtx_init(mutex, &LockCompatGroup, LCK_ATTR_NULL); -} - -void -mutex_lock_EXT( - lck_mtx_t *mutex) -{ - lck_mtx_lock(mutex); -} - -boolean_t -mutex_try_EXT( - lck_mtx_t *mutex) -{ - return(lck_mtx_try_lock(mutex)); -} - -void -mutex_unlock_EXT( - lck_mtx_t *mutex) -{ - lck_mtx_unlock(mutex); -} - -wait_result_t -thread_sleep_mutex_EXT( - event_t event, - lck_mtx_t *mutex, - wait_interrupt_t interruptible) -{ - return( lck_mtx_sleep(mutex, LCK_SLEEP_DEFAULT, event, interruptible)); -} - -wait_result_t -thread_sleep_mutex_deadline_EXT( - event_t event, - lck_mtx_t *mutex, - uint64_t deadline, - wait_interrupt_t interruptible) -{ - return( lck_mtx_sleep_deadline(mutex, LCK_SLEEP_DEFAULT, event, interruptible, deadline)); -} - -void -usimple_lock_EXT( - lck_spin_t *lock) -{ - lck_spin_lock(lock); -} - -void -usimple_lock_init_EXT( - lck_spin_t *lock, - __unused unsigned short tag) -{ - lck_spin_init(lock, &LockCompatGroup, LCK_ATTR_NULL); -} - -unsigned int -usimple_lock_try_EXT( - lck_spin_t *lock) -{ - return(lck_spin_try_lock(lock)); -} - -void -usimple_unlock_EXT( - lck_spin_t *lock) -{ - lck_spin_unlock(lock); -} - -wait_result_t -thread_sleep_usimple_lock_EXT( - event_t event, - lck_spin_t *lock, - wait_interrupt_t interruptible) -{ - return( lck_spin_sleep(lock, LCK_SLEEP_DEFAULT, event, interruptible)); -}