1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64#include <mach_ldebug.h>
65
66#include <kern/locks.h>
67#include <kern/kalloc.h>
68#include <kern/misc_protos.h>
69#include <kern/thread.h>
70#include <kern/processor.h>
71#include <kern/cpu_data.h>
72#include <kern/cpu_number.h>
73#include <kern/sched_prim.h>
74#include <kern/xpr.h>
75#include <kern/debug.h>
76#include <string.h>
77
78#include <i386/machine_routines.h> /* machine_timeout_suspended() */
79#include <machine/machine_cpu.h>
80#include <i386/mp.h>
81
82#include <sys/kdebug.h>
83#include <mach/branch_predicates.h>
84
85/*
86 * We need only enough declarations from the BSD-side to be able to
87 * test if our probe is active, and to call __dtrace_probe(). Setting
88 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
89 */
90#if CONFIG_DTRACE
91#define NEED_DTRACE_DEFS
92#include <../bsd/sys/lockstat.h>
93#endif
94
95#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
96#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
97#define LCK_RW_LCK_SHARED_CODE 0x102
98#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
99#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
100#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
101
102#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
103#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
104#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
105#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
106#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
107#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
108#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
109#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
110
111
112#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
113
114unsigned int LcksOpts=0;
115
116/* Forwards */
117
118#if USLOCK_DEBUG
119/*
120 * Perform simple lock checks.
121 */
122int uslock_check = 1;
123int max_lock_loops = 100000000;
124decl_simple_lock_data(extern , printf_lock)
125decl_simple_lock_data(extern , panic_lock)
126#endif /* USLOCK_DEBUG */
127
128extern unsigned int not_in_kdp;
129
130/*
131 * We often want to know the addresses of the callers
132 * of the various lock routines. However, this information
133 * is only used for debugging and statistics.
134 */
135typedef void *pc_t;
136#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
137#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
138#if ANY_LOCK_DEBUG
139#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
140#define DECL_PC(pc) pc_t pc;
141#else /* ANY_LOCK_DEBUG */
142#define DECL_PC(pc)
143#ifdef lint
144/*
145 * Eliminate lint complaints about unused local pc variables.
146 */
147#define OBTAIN_PC(pc) ++pc
148#else /* lint */
149#define OBTAIN_PC(pc)
150#endif /* lint */
151#endif /* ANY_LOCK_DEBUG */
152
153
154/*
155 * Portable lock package implementation of usimple_locks.
156 */
157
158#if USLOCK_DEBUG
159#define USLDBG(stmt) stmt
160void usld_lock_init(usimple_lock_t, unsigned short);
161void usld_lock_pre(usimple_lock_t, pc_t);
162void usld_lock_post(usimple_lock_t, pc_t);
163void usld_unlock(usimple_lock_t, pc_t);
164void usld_lock_try_pre(usimple_lock_t, pc_t);
165void usld_lock_try_post(usimple_lock_t, pc_t);
166int usld_lock_common_checks(usimple_lock_t, char *);
167#else /* USLOCK_DEBUG */
168#define USLDBG(stmt)
169#endif /* USLOCK_DEBUG */
170
171
172extern int lck_rw_grab_want(lck_rw_t *lck);
173extern int lck_rw_grab_shared(lck_rw_t *lck);
174extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
175
176
177/*
178 * Forward definitions
179 */
180
181void lck_rw_lock_shared_gen(
182 lck_rw_t *lck);
183
184void lck_rw_lock_exclusive_gen(
185 lck_rw_t *lck);
186
187boolean_t lck_rw_lock_shared_to_exclusive_success(
188 lck_rw_t *lck);
189
190boolean_t lck_rw_lock_shared_to_exclusive_failure(
191 lck_rw_t *lck,
192 int prior_lock_state);
193
194void lck_rw_lock_exclusive_to_shared_gen(
195 lck_rw_t *lck,
196 int prior_lock_state);
197
198lck_rw_type_t lck_rw_done_gen(
199 lck_rw_t *lck,
200 int prior_lock_state);
201
202void lck_rw_clear_promotions_x86(thread_t thread);
203
204/*
205 * Routine: lck_spin_alloc_init
206 */
207lck_spin_t *
208lck_spin_alloc_init(
209 lck_grp_t *grp,
210 lck_attr_t *attr)
211{
212 lck_spin_t *lck;
213
214 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
215 lck_spin_init(lck, grp, attr);
216
217 return(lck);
218}
219
220/*
221 * Routine: lck_spin_free
222 */
223void
224lck_spin_free(
225 lck_spin_t *lck,
226 lck_grp_t *grp)
227{
228 lck_spin_destroy(lck, grp);
229 kfree(lck, sizeof(lck_spin_t));
230}
231
232/*
233 * Routine: lck_spin_init
234 */
235void
236lck_spin_init(
237 lck_spin_t *lck,
238 lck_grp_t *grp,
239 __unused lck_attr_t *attr)
240{
241 usimple_lock_init((usimple_lock_t) lck, 0);
242 lck_grp_reference(grp);
243 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
244}
245
246/*
247 * Routine: lck_spin_destroy
248 */
249void
250lck_spin_destroy(
251 lck_spin_t *lck,
252 lck_grp_t *grp)
253{
254 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
255 return;
256 lck->interlock = LCK_SPIN_TAG_DESTROYED;
257 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
258 lck_grp_deallocate(grp);
259 return;
260}
261
262/*
263 * Routine: lck_spin_lock
264 */
265void
266lck_spin_lock(
267 lck_spin_t *lck)
268{
269 usimple_lock((usimple_lock_t) lck);
270}
271
272/*
273 * Routine: lck_spin_unlock
274 */
275void
276lck_spin_unlock(
277 lck_spin_t *lck)
278{
279 usimple_unlock((usimple_lock_t) lck);
280}
281
282
283/*
284 * Routine: lck_spin_try_lock
285 */
286boolean_t
287lck_spin_try_lock(
288 lck_spin_t *lck)
289{
290 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
291#if DEVELOPMENT || DEBUG
292 if (lrval) {
293 pltrace(FALSE);
294 }
295#endif
296 return(lrval);
297}
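
/*
 * Illustrative sketch (not part of the original file): typical use of the
 * lck_spin_* interface implemented above.  The group name "example" and the
 * variable names are hypothetical; error handling is elided.
 */
#if 0	/* example only -- not compiled */
static lck_grp_t	*example_grp;
static lck_spin_t	*example_slock;

static void
example_spin_usage(void)
{
	example_grp   = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	example_slock = lck_spin_alloc_init(example_grp, LCK_ATTR_NULL);

	lck_spin_lock(example_slock);		/* returns with preemption disabled */
	/* ... short, non-blocking critical section ... */
	lck_spin_unlock(example_slock);

	if (lck_spin_try_lock(example_slock)) {	/* non-blocking attempt */
		lck_spin_unlock(example_slock);
	}

	lck_spin_free(example_slock, example_grp);
	lck_grp_free(example_grp);
}
#endif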
298
299/*
300 * Routine: lck_spin_assert
301 */
302void
303lck_spin_assert(lck_spin_t *lock, unsigned int type)
304{
305 thread_t thread, holder;
306 uintptr_t state;
307
308 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
309 panic("lck_spin_assert(): invalid arg (%u)", type);
310 }
311
312 state = lock->interlock;
313 holder = (thread_t)state;
314 thread = current_thread();
315 if (type == LCK_ASSERT_OWNED) {
316 if (__improbable(holder == THREAD_NULL)) {
317 panic("Lock not owned %p = %lx", lock, state);
318 }
319 if (__improbable(holder != thread)) {
320 panic("Lock not owned by current thread %p = %lx", lock, state);
321 }
322 } else if (type == LCK_ASSERT_NOTOWNED) {
323 if (__improbable(holder != THREAD_NULL)) {
324 if (holder == thread) {
325 panic("Lock owned by current thread %p = %lx", lock, state);
326 } else {
327 panic("Lock %p owned by thread %p", lock, holder);
328 }
329 }
330 }
331}
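
/*
 * Illustrative sketch (not part of the original file): a routine that requires
 * its caller to hold a given spin lock can enforce that contract with
 * lck_spin_assert().  The names below are hypothetical.
 */
#if 0	/* example only -- not compiled */
static void
example_requires_slock_held(lck_spin_t *slock)
{
	lck_spin_assert(slock, LCK_ASSERT_OWNED);
	/* ... manipulate state protected by slock ... */
}
#endif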
332
333/*
334 * Routine: kdp_lck_spin_is_acquired
335 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
336 * Returns: TRUE if lock is acquired.
337 */
338boolean_t
339kdp_lck_spin_is_acquired(lck_spin_t *lck) {
340 if (not_in_kdp) {
341 panic("spinlock acquired check done outside of kernel debugger");
342 }
343 return (lck->interlock != 0)? TRUE : FALSE;
344}
345
346/*
347 * Initialize a usimple_lock.
348 *
349 * No change in preemption state.
350 */
351void
352usimple_lock_init(
353 usimple_lock_t l,
354 __unused unsigned short tag)
355{
356#ifndef MACHINE_SIMPLE_LOCK
357 USLDBG(usld_lock_init(l, tag));
358 hw_lock_init(&l->interlock);
359#else
360 simple_lock_init((simple_lock_t)l,tag);
361#endif
362}
363
364volatile uint32_t spinlock_owner_cpu = ~0;
365volatile usimple_lock_t spinlock_timed_out;
366
367uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
368 uint64_t deadline;
369 uint32_t i;
370
371 for (i = 0; i < real_ncpus; i++) {
372 if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
373 spinlock_owner_cpu = i;
374 if ((uint32_t) cpu_number() == i)
375 break;
376 cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
377 cpu_NMI_interrupt(i);
378 deadline = mach_absolute_time() + (LockTimeOut * 2);
379 while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
380 cpu_pause();
381 break;
382 }
383 }
384
385 return spinlock_owner_cpu;
386}
387
388/*
389 * Acquire a usimple_lock.
390 *
391 * Returns with preemption disabled. Note
392 * that the hw_lock routines are responsible for
393 * maintaining preemption state.
394 */
395void
396usimple_lock(
397 usimple_lock_t l)
398{
399#ifndef MACHINE_SIMPLE_LOCK
400 DECL_PC(pc);
401
402 OBTAIN_PC(pc);
403 USLDBG(usld_lock_pre(l, pc));
404
405 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
406 boolean_t uslock_acquired = FALSE;
407 while (machine_timeout_suspended()) {
408 enable_preemption();
409 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
410 break;
411 }
412
413 if (uslock_acquired == FALSE) {
414 uint32_t lock_cpu;
415 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
416 spinlock_timed_out = l;
417 lock_cpu = spinlock_timeout_NMI(lowner);
418 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
419 }
420 }
421#if DEVELOPMENT || DEBUG
422 pltrace(FALSE);
423#endif
424
425 USLDBG(usld_lock_post(l, pc));
426#else
427 simple_lock((simple_lock_t)l);
428#endif
429}
430
431
432/*
433 * Release a usimple_lock.
434 *
435 * Returns with preemption enabled. Note
436 * that the hw_lock routines are responsible for
437 * maintaining preemption state.
438 */
439void
440usimple_unlock(
441 usimple_lock_t l)
442{
443#ifndef MACHINE_SIMPLE_LOCK
444 DECL_PC(pc);
445
446 OBTAIN_PC(pc);
447 USLDBG(usld_unlock(l, pc));
448#if DEVELOPMENT || DEBUG
449 pltrace(TRUE);
450#endif
451 hw_lock_unlock(&l->interlock);
452#else
453 simple_unlock_rwmb((simple_lock_t)l);
454#endif
455}
456
457
458/*
459 * Conditionally acquire a usimple_lock.
460 *
461 * On success, returns with preemption disabled.
462 * On failure, returns with preemption in the same state
463 * as when first invoked. Note that the hw_lock routines
464 * are responsible for maintaining preemption state.
465 *
466 * XXX No stats are gathered on a miss; I preserved this
467 * behavior from the original assembly-language code, but
468 * doesn't it make sense to log misses? XXX
469 */
470unsigned int
471usimple_lock_try(
472 usimple_lock_t l)
473{
474#ifndef MACHINE_SIMPLE_LOCK
475 unsigned int success;
476 DECL_PC(pc);
477
478 OBTAIN_PC(pc);
479 USLDBG(usld_lock_try_pre(l, pc));
480 if ((success = hw_lock_try(&l->interlock))) {
481#if DEVELOPMENT || DEBUG
482 pltrace(FALSE);
483#endif
484 USLDBG(usld_lock_try_post(l, pc));
485 }
486 return success;
487#else
488 return(simple_lock_try((simple_lock_t)l));
489#endif
490}
491
492/*
493 * Acquire a usimple_lock while polling for pending TLB flushes
494 * and spinning on a lock.
495 *
496 */
497void
498usimple_lock_try_lock_loop(usimple_lock_t l)
499{
500 boolean_t istate = ml_get_interrupts_enabled();
501 while (!simple_lock_try((l))) {
502 if (!istate)
503 handle_pending_TLB_flushes();
504 cpu_pause();
505 }
506}
507
508#if USLOCK_DEBUG
509/*
510 * States of a usimple_lock. The default when initializing
511 * a usimple_lock is setting it up for debug checking.
512 */
513#define USLOCK_CHECKED 0x0001 /* lock is being checked */
514#define USLOCK_TAKEN 0x0002 /* lock has been taken */
515#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
516#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
517#define USLOCK_CHECKING(l) (uslock_check && \
518 ((l)->debug.state & USLOCK_CHECKED))
519
520/*
521 * Trace activities of a particularly interesting lock.
522 */
523void usl_trace(usimple_lock_t, int, pc_t, const char *);
524
525
526/*
527 * Initialize the debugging information contained
528 * in a usimple_lock.
529 */
530void
531usld_lock_init(
532 usimple_lock_t l,
533 __unused unsigned short tag)
534{
535 if (l == USIMPLE_LOCK_NULL)
536 panic("lock initialization: null lock pointer");
537 l->lock_type = USLOCK_TAG;
538 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
539 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
540 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
541 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
542 l->debug.duration[0] = l->debug.duration[1] = 0;
543 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
544 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
545 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
546}
547
548
549/*
550 * These checks apply to all usimple_locks, not just
551 * those with USLOCK_CHECKED turned on.
552 */
553int
554usld_lock_common_checks(
555 usimple_lock_t l,
556 char *caller)
557{
558 if (l == USIMPLE_LOCK_NULL)
559 panic("%s: null lock pointer", caller);
560 if (l->lock_type != USLOCK_TAG)
561 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
562 if (!(l->debug.state & USLOCK_INIT))
563 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
564 return USLOCK_CHECKING(l);
565}
566
567
568/*
569 * Debug checks on a usimple_lock just before attempting
570 * to acquire it.
571 */
572/* ARGSUSED */
573void
574usld_lock_pre(
575 usimple_lock_t l,
576 pc_t pc)
577{
578 char caller[] = "usimple_lock";
579
580
581 if (!usld_lock_common_checks(l, caller))
582 return;
583
584/*
585 * Note that we have a weird case where we are getting a lock when we are
586 * in the process of putting the system to sleep. We are running with no
587 * current threads, therefore we can't tell if we are trying to retake a lock
588 * we have or someone on the other processor has it. Therefore we just
589 * ignore this test if the locking thread is 0.
590 */
591
592 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
593 l->debug.lock_thread == (void *) current_thread()) {
594 printf("%s: lock %p already locked (at %p) by",
595 caller, l, l->debug.lock_pc);
596 printf(" current thread %p (new attempt at pc %p)\n",
597 l->debug.lock_thread, pc);
598 panic("%s", caller);
599 }
600 mp_disable_preemption();
601 usl_trace(l, cpu_number(), pc, caller);
602 mp_enable_preemption();
603}
604
605
606/*
607 * Debug checks on a usimple_lock just after acquiring it.
608 *
609 * Pre-emption has been disabled at this point,
610 * so we are safe in using cpu_number.
611 */
612void
613usld_lock_post(
614 usimple_lock_t l,
615 pc_t pc)
616{
617 int mycpu;
618 char caller[] = "successful usimple_lock";
619
620
621 if (!usld_lock_common_checks(l, caller))
622 return;
623
624 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
625 panic("%s: lock %p became uninitialized",
626 caller, l);
627 if ((l->debug.state & USLOCK_TAKEN))
628 panic("%s: lock 0x%p became TAKEN by someone else",
629 caller, l);
630
631 mycpu = cpu_number();
632 l->debug.lock_thread = (void *)current_thread();
633 l->debug.state |= USLOCK_TAKEN;
634 l->debug.lock_pc = pc;
635 l->debug.lock_cpu = mycpu;
636
637 usl_trace(l, mycpu, pc, caller);
638}
639
640
641/*
642 * Debug checks on a usimple_lock just before
643 * releasing it. Note that the caller has not
644 * yet released the hardware lock.
645 *
646 * Preemption is still disabled, so there's
647 * no problem using cpu_number.
648 */
649void
650usld_unlock(
651 usimple_lock_t l,
652 pc_t pc)
653{
654 int mycpu;
655 char caller[] = "usimple_unlock";
656
657
658 if (!usld_lock_common_checks(l, caller))
659 return;
660
661 mycpu = cpu_number();
662
663 if (!(l->debug.state & USLOCK_TAKEN))
664 panic("%s: lock 0x%p hasn't been taken",
665 caller, l);
666 if (l->debug.lock_thread != (void *) current_thread())
667 panic("%s: unlocking lock 0x%p, owned by thread %p",
668 caller, l, l->debug.lock_thread);
669 if (l->debug.lock_cpu != mycpu) {
670 printf("%s: unlocking lock 0x%p on cpu 0x%x",
671 caller, l, mycpu);
672 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
673 panic("%s", caller);
674 }
675 usl_trace(l, mycpu, pc, caller);
676
677 l->debug.unlock_thread = l->debug.lock_thread;
678 l->debug.lock_thread = INVALID_THREAD;
679 l->debug.state &= ~USLOCK_TAKEN;
680 l->debug.unlock_pc = pc;
681 l->debug.unlock_cpu = mycpu;
682}
683
684
685/*
686 * Debug checks on a usimple_lock just before
687 * attempting to acquire it.
688 *
689 * Preemption isn't guaranteed to be disabled.
690 */
691void
692usld_lock_try_pre(
693 usimple_lock_t l,
694 pc_t pc)
695{
696 char caller[] = "usimple_lock_try";
697
698 if (!usld_lock_common_checks(l, caller))
699 return;
700 mp_disable_preemption();
701 usl_trace(l, cpu_number(), pc, caller);
702 mp_enable_preemption();
703}
704
705
706/*
707 * Debug checks on a usimple_lock just after
708 * successfully attempting to acquire it.
709 *
710 * Preemption has been disabled by the
711 * lock acquisition attempt, so it's safe
712 * to use cpu_number.
713 */
714void
715usld_lock_try_post(
716 usimple_lock_t l,
717 pc_t pc)
718{
719 int mycpu;
720 char caller[] = "successful usimple_lock_try";
721
722 if (!usld_lock_common_checks(l, caller))
723 return;
724
725 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
726 panic("%s: lock 0x%p became uninitialized",
727 caller, l);
728 if ((l->debug.state & USLOCK_TAKEN))
729 panic("%s: lock 0x%p became TAKEN by someone else",
730 caller, l);
731
732 mycpu = cpu_number();
733 l->debug.lock_thread = (void *) current_thread();
734 l->debug.state |= USLOCK_TAKEN;
735 l->debug.lock_pc = pc;
736 l->debug.lock_cpu = mycpu;
737
738 usl_trace(l, mycpu, pc, caller);
739}
740
741
742/*
743 * For very special cases, set traced_lock to point to a
744 * specific lock of interest. The result is a series of
745 * XPRs showing lock operations on that lock. The lock_seq
746 * value is used to show the order of those operations.
747 */
748usimple_lock_t traced_lock;
749unsigned int lock_seq;
750
751void
752usl_trace(
753 usimple_lock_t l,
754 int mycpu,
755 pc_t pc,
756 const char * op_name)
757{
758 if (traced_lock == l) {
759 XPR(XPR_SLOCK,
760 "seq %d, cpu %d, %s @ %x\n",
761 (uintptr_t) lock_seq, (uintptr_t) mycpu,
762 (uintptr_t) op_name, (uintptr_t) pc, 0);
763 lock_seq++;
764 }
765}
766
767
768#endif /* USLOCK_DEBUG */
769
770/*
771 * Routine: lck_rw_alloc_init
772 */
773lck_rw_t *
774lck_rw_alloc_init(
775 lck_grp_t *grp,
776 lck_attr_t *attr) {
777 lck_rw_t *lck;
778
779 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
780 bzero(lck, sizeof(lck_rw_t));
781 lck_rw_init(lck, grp, attr);
782 }
783
784 return(lck);
785}
786
787/*
788 * Routine: lck_rw_free
789 */
790void
791lck_rw_free(
792 lck_rw_t *lck,
793 lck_grp_t *grp) {
794 lck_rw_destroy(lck, grp);
795 kfree(lck, sizeof(lck_rw_t));
796}
797
798/*
799 * Routine: lck_rw_init
800 */
801void
802lck_rw_init(
803 lck_rw_t *lck,
804 lck_grp_t *grp,
805 lck_attr_t *attr)
806{
807 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
808 attr : &LockDefaultLckAttr;
809
810 hw_lock_byte_init(&lck->lck_rw_interlock);
811 lck->lck_rw_want_write = FALSE;
812 lck->lck_rw_want_upgrade = FALSE;
813 lck->lck_rw_shared_count = 0;
814 lck->lck_rw_can_sleep = TRUE;
815 lck->lck_r_waiting = lck->lck_w_waiting = 0;
816 lck->lck_rw_tag = 0;
817 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
818 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
819
820 lck_grp_reference(grp);
821 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
822}
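
/*
 * Illustrative sketch (not part of the original file): lck_rw_priv_excl
 * defaults to TRUE (waiting writers block new readers).  Initializing the lock
 * with an attribute carrying LCK_ATTR_RW_SHARED_PRIORITY clears it.  The
 * helper lck_attr_rw_shared_priority() and the names below are assumptions
 * made for the example.
 */
#if 0	/* example only -- not compiled */
	lck_attr_t	*attr = lck_attr_alloc_init();

	lck_attr_rw_shared_priority(attr);	/* readers are not held up behind waiting writers */
	lck_rw_init(example_rwlock, example_grp, attr);
	lck_attr_free(attr);
#endif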
823
824/*
825 * Routine: lck_rw_destroy
826 */
827void
828lck_rw_destroy(
829 lck_rw_t *lck,
830 lck_grp_t *grp)
831{
832 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
833 return;
834#if MACH_LDEBUG
835 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
836#endif
837 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
838 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
839 lck_grp_deallocate(grp);
840 return;
841}
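
/*
 * Illustrative sketch (not part of the original file): typical reader/writer
 * use of the lck_rw_* interface managed above.  Names are hypothetical; note
 * that lck_rw_done() works out from the lock state whether a shared or an
 * exclusive hold is being dropped.
 */
#if 0	/* example only -- not compiled */
static lck_grp_t	*example_rw_grp;
static lck_rw_t		*example_rwlock;

static void
example_rw_usage(void)
{
	example_rw_grp = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	example_rwlock = lck_rw_alloc_init(example_rw_grp, LCK_ATTR_NULL);

	lck_rw_lock_shared(example_rwlock);	/* many readers may hold this at once */
	/* ... read shared state ... */
	lck_rw_done(example_rwlock);

	lck_rw_lock_exclusive(example_rwlock);	/* single writer */
	/* ... modify shared state ... */
	lck_rw_done(example_rwlock);

	lck_rw_free(example_rwlock, example_rw_grp);
	lck_grp_free(example_rw_grp);
}
#endif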
842
843/*
844 * Sleep locks. These use the same data structure and algorithm
845 * as the spin locks, but the process sleeps while it is waiting
846 * for the lock. These work on uniprocessor systems.
847 */
848
849#define DECREMENTER_TIMEOUT 1000000
850
851#define RW_LOCK_READER_EVENT(x) \
852 ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
853
854#define RW_LOCK_WRITER_EVENT(x) \
855 ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
856
857/*
858 * We disable interrupts while holding the RW interlock to prevent an
859 * interrupt from exacerbating hold time.
860 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
861 */
862static boolean_t
863lck_interlock_lock(lck_rw_t *lck)
864{
865 boolean_t istate;
866
867 istate = ml_set_interrupts_enabled(FALSE);
868 hw_lock_byte_lock(&lck->lck_rw_interlock);
869
870 return istate;
871}
872
873static void
874lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
875{
876 hw_lock_byte_unlock(&lck->lck_rw_interlock);
877 ml_set_interrupts_enabled(istate);
878}
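
/*
 * Illustrative sketch (not part of the original file): the canonical pattern
 * for the two helpers above -- the interrupt state returned by
 * lck_interlock_lock() must be handed back to lck_interlock_unlock() so the
 * caller's interrupt level is restored.
 */
#if 0	/* example only -- not compiled */
	boolean_t istate;

	istate = lck_interlock_lock(lck);	/* interrupts off, interlock held */
	/* ... examine or adjust the lck_rw_t state fields atomically ... */
	lck_interlock_unlock(lck, istate);	/* interlock dropped, interrupts restored */
#endif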
879
880/*
881 * This inline is used when busy-waiting for an rw lock.
882 * If interrupts were disabled when the lock primitive was called,
883 * we poll the IPI handler for pending tlb flushes.
884 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
885 */
886static inline void
887lck_rw_lock_pause(boolean_t interrupts_enabled)
888{
889 if (!interrupts_enabled)
890 handle_pending_TLB_flushes();
891 cpu_pause();
892}
893
894
895/*
896 * compute the deadline to spin against when
897 * waiting for a change of state on a lck_rw_t
898 */
899static inline uint64_t
900lck_rw_deadline_for_spin(lck_rw_t *lck)
901{
902 if (lck->lck_rw_can_sleep) {
903 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
904 /*
905 * there are already threads waiting on this lock... this
906 * implies that they have spun beyond their deadlines waiting for
907 * the desired state to show up so we will not bother spinning at this time...
908 * or
909 * the current number of threads sharing this lock exceeds our capacity to run them
910 * concurrently and since all states we're going to spin for require the rw_shared_count
911 * to be at 0, we'll not bother spinning since the latency for this to happen is
912 * unpredictable...
913 */
914 return (mach_absolute_time());
915 }
916 return (mach_absolute_time() + MutexSpin);
917 } else
918 return (mach_absolute_time() + (100000LL * 1000000000LL));
919}
920
921
922/*
923 * Routine: lck_rw_lock_exclusive
924 */
925void
926lck_rw_lock_exclusive_gen(
927 lck_rw_t *lck)
928{
929 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
930 uint64_t deadline = 0;
931 int slept = 0;
932 int gotlock = 0;
933 int lockheld = 0;
934 wait_result_t res = 0;
935 boolean_t istate = -1;
936
937#if CONFIG_DTRACE
938 boolean_t dtrace_ls_initialized = FALSE;
939 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
940 uint64_t wait_interval = 0;
941 int readers_at_sleep = 0;
942#endif
943
944 /*
945 * Try to acquire the lck_rw_want_write bit.
946 */
947 while ( !lck_rw_grab_want(lck)) {
948
949#if CONFIG_DTRACE
950 if (dtrace_ls_initialized == FALSE) {
951 dtrace_ls_initialized = TRUE;
952 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
953 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
954 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
955 if (dtrace_ls_enabled) {
956 /*
957 * Either sleeping or spinning is happening,
958 * start a timing of our delay interval now.
959 */
960 readers_at_sleep = lck->lck_rw_shared_count;
961 wait_interval = mach_absolute_time();
962 }
963 }
964#endif
965 if (istate == -1)
966 istate = ml_get_interrupts_enabled();
967
968 deadline = lck_rw_deadline_for_spin(lck);
969
970 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
971
972 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
973 lck_rw_lock_pause(istate);
974
975 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
976
977 if (gotlock)
978 break;
979 /*
980 * if we get here, the deadline has expired w/o us
981 * being able to grab the lock exclusively
982 * check to see if we're allowed to do a thread_block
983 */
984 if (lck->lck_rw_can_sleep) {
985
986 istate = lck_interlock_lock(lck);
987
988 if (lck->lck_rw_want_write) {
989
990 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
991
992 lck->lck_w_waiting = TRUE;
993
994 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
995 lck_interlock_unlock(lck, istate);
996
997 if (res == THREAD_WAITING) {
998 res = thread_block(THREAD_CONTINUE_NULL);
999 slept++;
1000 }
1001 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1002 } else {
1003 lck->lck_rw_want_write = TRUE;
1004 lck_interlock_unlock(lck, istate);
1005 break;
1006 }
1007 }
1008 }
1009 /*
1010 * Wait for readers (and upgrades) to finish...
1011 * the test for these conditions must be done simultaneously with
1012 * a check of the interlock not being held since
1013 * the rw_shared_count will drop to 0 first and then want_upgrade
1014 * will be set to 1 in the shared_to_exclusive scenario... those
1015 * adjustments are done behind the interlock and represent an
1016 * atomic change in state and must be considered as such
1017 * however, once we see the read count at 0, the want_upgrade not set
1018 * and the interlock not held, we are safe to proceed
1019 */
1020 while (lck_rw_held_read_or_upgrade(lck)) {
1021
1022#if CONFIG_DTRACE
1023 /*
1024 * Either sleeping or spinning is happening, start
1025 * a timing of our delay interval now. If we set it
1026 * to -1 we don't have accurate data so we cannot later
1027 * decide to record a dtrace spin or sleep event.
1028 */
1029 if (dtrace_ls_initialized == FALSE) {
1030 dtrace_ls_initialized = TRUE;
1031 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1032 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1033 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1034 if (dtrace_ls_enabled) {
1035 /*
1036 * Either sleeping or spinning is happening,
1037 * start a timing of our delay interval now.
1038 */
1039 readers_at_sleep = lck->lck_rw_shared_count;
1040 wait_interval = mach_absolute_time();
1041 }
1042 }
1043#endif
1044 if (istate == -1)
1045 istate = ml_get_interrupts_enabled();
1046
1047 deadline = lck_rw_deadline_for_spin(lck);
1048
1049 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1050
1051 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1052 lck_rw_lock_pause(istate);
1053
1054 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1055
1056 if ( !lockheld)
1057 break;
1058 /*
1059 * if we get here, the deadline has expired w/o us
1060 * being able to grab the lock exclusively
1061 * check to see if we're allowed to do a thread_block
1062 */
1063 if (lck->lck_rw_can_sleep) {
1064
1065 istate = lck_interlock_lock(lck);
1066
1067 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1068 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1069
1070 lck->lck_w_waiting = TRUE;
1071
1072 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1073 lck_interlock_unlock(lck, istate);
1074
1075 if (res == THREAD_WAITING) {
1076 res = thread_block(THREAD_CONTINUE_NULL);
1077 slept++;
1078 }
1079 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1080 } else {
1081 lck_interlock_unlock(lck, istate);
1082 /*
1083 * must own the lock now, since we checked for
1084 * readers or upgrade owner behind the interlock
1085 * no need for a call to 'lck_rw_held_read_or_upgrade'
1086 */
1087 break;
1088 }
1089 }
1090 }
1091
1092#if CONFIG_DTRACE
1093 /*
1094 * Decide what latencies we suffered that are Dtrace events.
1095 * If we have set wait_interval, then we either spun or slept.
1096 * At least we get out from under the interlock before we record
1097 * which is the best we can do here to minimize the impact
1098 * of the tracing.
1099 * If dtrace was not enabled when we started sleeping/spinning,
1100 * dtrace_ls_enabled is FALSE and we don't record this event.
1101 */
1102 if (dtrace_ls_enabled == TRUE) {
1103 if (slept == 0) {
1104 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1105 mach_absolute_time() - wait_interval, 1);
1106 } else {
1107 /*
1108 * For the blocking case, we also record if when we blocked
1109 * it was held for read or write, and how many readers.
1110 * Notice that above we recorded this before we dropped
1111 * the interlock so the count is accurate.
1112 */
1113 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1114 mach_absolute_time() - wait_interval, 1,
1115 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1116 }
1117 }
1118 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1119#endif
1120}
1121
1122
1123/*
1124 * Routine: lck_rw_done_gen
1125 *
1126 * called from the assembly language wrapper...
1127 * prior_lock_state is the value in the 1st
1128 * word of the lock at the time of a successful
1129 * atomic compare and exchange with the new value...
1130 * it represents the state of the lock before we
1131 * decremented the rw_shared_count or cleared either
1132 * rw_want_upgrade or rw_want_write and
1133 * the lck_x_waiting bits... since the wrapper
1134 * routine has already changed the state atomically,
1135 * we just need to decide if we should
1136 * wake up anyone and what value to return... we do
1137 * this by examining the state of the lock before
1138 * we changed it
1139 */
1140lck_rw_type_t
1141lck_rw_done_gen(
1142 lck_rw_t *lck,
1143 int prior_lock_state)
1144{
1145 lck_rw_t *fake_lck;
1146 lck_rw_type_t lock_type;
1147 thread_t thread;
1148 uint32_t rwlock_count;
1149
1150 /*
1151 * prior_lock state is a snapshot of the 1st word of the
1152 * lock in question... we'll fake up a pointer to it
1153 * and carefully not access anything beyond whats defined
1154 * in the first word of a lck_rw_t
1155 */
1156 fake_lck = (lck_rw_t *)&prior_lock_state;
1157
1158 if (fake_lck->lck_rw_shared_count <= 1) {
1159 if (fake_lck->lck_w_waiting)
1160 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1161
1162 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1163 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1164 }
1165 if (fake_lck->lck_rw_shared_count)
1166 lock_type = LCK_RW_TYPE_SHARED;
1167 else
1168 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1169
1170 /* Check if dropping the lock means that we need to unpromote */
1171 thread = current_thread();
1172 rwlock_count = thread->rwlock_count--;
1173#if MACH_LDEBUG
1174 if (rwlock_count == 0) {
1175 panic("rw lock count underflow for thread %p", thread);
1176 }
1177#endif
1178 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1179 /* sched_flags checked without lock, but will be rechecked while clearing */
1180 lck_rw_clear_promotion(thread);
1181 }
1182
1183#if CONFIG_DTRACE
1184 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1185#endif
1186
1187 return(lock_type);
1188}
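
/*
 * Illustrative sketch (not part of the original file): the "fake_lck" overlay
 * used above (and in the other *_gen routines below) works because the first
 * 32-bit word of a lck_rw_t holds the bitfields of interest.  Casting the
 * address of the saved snapshot lets those bits be read symbolically instead
 * of with hand-coded masks; the variable names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	int		snapshot = prior_lock_state;	/* 1st word captured by the fastpath */
	lck_rw_t	*fake = (lck_rw_t *)&snapshot;

	if (fake->lck_w_waiting) {
		/* a writer was queued at the time of the atomic exchange */
	}
#endif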
1189
1190
1191/*
1192 * Routine: lck_rw_unlock
1193 */
1194void
1195lck_rw_unlock(
1196 lck_rw_t *lck,
1197 lck_rw_type_t lck_rw_type)
1198{
1199 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1200 lck_rw_unlock_shared(lck);
1201 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1202 lck_rw_unlock_exclusive(lck);
1203 else
1204 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1205}
1206
1207
1208/*
1209 * Routine: lck_rw_unlock_shared
1210 */
1211void
1212lck_rw_unlock_shared(
1213 lck_rw_t *lck)
1214{
1215 lck_rw_type_t ret;
1216
1217 ret = lck_rw_done(lck);
1218
1219 if (ret != LCK_RW_TYPE_SHARED)
1220 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1221}
1222
1223
1224/*
1225 * Routine: lck_rw_unlock_exclusive
1226 */
1227void
1228lck_rw_unlock_exclusive(
1229 lck_rw_t *lck)
1230{
1231 lck_rw_type_t ret;
1232
1233 ret = lck_rw_done(lck);
1234
1235 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1236 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1237}
1238
1239
1240/*
1241 * Routine: lck_rw_lock
1242 */
1243void
1244lck_rw_lock(
1245 lck_rw_t *lck,
1246 lck_rw_type_t lck_rw_type)
1247{
1248 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1249 lck_rw_lock_shared(lck);
1250 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1251 lck_rw_lock_exclusive(lck);
1252 else
1253 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1254}
1255
1256
1257/*
1258 * Routine: lck_rw_lock_shared_gen
1259 * Function:
1260 * assembly fast path code has determined that this lock
1261 * is held exclusively... this is where we spin/block
1262 * until we can acquire the lock in the shared mode
1263 */
1264void
1265lck_rw_lock_shared_gen(
1266 lck_rw_t *lck)
1267{
1268 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1269 uint64_t deadline = 0;
1270 int gotlock = 0;
1271 int slept = 0;
1272 wait_result_t res = 0;
1273 boolean_t istate = -1;
1274
1275#if CONFIG_DTRACE
1276 uint64_t wait_interval = 0;
1277 int readers_at_sleep = 0;
1278 boolean_t dtrace_ls_initialized = FALSE;
1279 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1280#endif
1281
1282 while ( !lck_rw_grab_shared(lck)) {
1283
1284#if CONFIG_DTRACE
1285 if (dtrace_ls_initialized == FALSE) {
1286 dtrace_ls_initialized = TRUE;
1287 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1288 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1289 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1290 if (dtrace_ls_enabled) {
1291 /*
1292 * Either sleeping or spinning is happening,
1293 * start a timing of our delay interval now.
1294 */
1295 readers_at_sleep = lck->lck_rw_shared_count;
1296 wait_interval = mach_absolute_time();
1297 }
1298 }
1299#endif
1300 if (istate == -1)
1301 istate = ml_get_interrupts_enabled();
1302
1303 deadline = lck_rw_deadline_for_spin(lck);
1304
1305 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1306 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1307
1308 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1309 lck_rw_lock_pause(istate);
1310
1311 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1312 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1313
1314 if (gotlock)
1315 break;
1316 /*
1317 * if we get here, the deadline has expired w/o us
1318 * being able to grab the lock for read
1319 * check to see if we're allowed to do a thread_block
1320 */
1321 if (lck->lck_rw_can_sleep) {
1322
1323 istate = lck_interlock_lock(lck);
1324
1325 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1326 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1327
1328 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1329 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1330
1331 lck->lck_r_waiting = TRUE;
1332
1333 res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1334 lck_interlock_unlock(lck, istate);
1335
1336 if (res == THREAD_WAITING) {
1337 res = thread_block(THREAD_CONTINUE_NULL);
1338 slept++;
1339 }
1340 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1341 trace_lck, res, slept, 0, 0);
1342 } else {
1343 lck->lck_rw_shared_count++;
1344 lck_interlock_unlock(lck, istate);
1345 break;
1346 }
1347 }
1348 }
1349
1350#if CONFIG_DTRACE
1351 if (dtrace_ls_enabled == TRUE) {
1352 if (slept == 0) {
1353 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1354 } else {
1355 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1356 mach_absolute_time() - wait_interval, 0,
1357 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1358 }
1359 }
1360 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1361#endif
1362}
1363
1364
1365/*
1366 * Routine: lck_rw_lock_shared_to_exclusive_failure
1367 * Function:
1368 * assembly fast path code has already dropped our read
1369 * count and determined that someone else owns 'lck_rw_want_upgrade'
1370 * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1371 * all we need to do here is determine if a wakeup is needed
1372 */
1373boolean_t
1374lck_rw_lock_shared_to_exclusive_failure(
1375 lck_rw_t *lck,
1376 int prior_lock_state)
1377{
1378 lck_rw_t *fake_lck;
1379 thread_t thread = current_thread();
1380 uint32_t rwlock_count;
1381
1382 /* Check if dropping the lock means that we need to unpromote */
1383 rwlock_count = thread->rwlock_count--;
1384#if MACH_LDEBUG
1385 if (rwlock_count == 0) {
1386 panic("rw lock count underflow for thread %p", thread);
1387 }
1388#endif
1389 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1390 /* sched_flags checked without lock, but will be rechecked while clearing */
1391 lck_rw_clear_promotion(thread);
1392 }
1393
1394 /*
1395 * prior_lock state is a snapshot of the 1st word of the
1396 * lock in question... we'll fake up a pointer to it
1397 * and carefully not access anything beyond whats defined
1398 * in the first word of a lck_rw_t
1399 */
1400 fake_lck = (lck_rw_t *)&prior_lock_state;
1401
1402 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1403 /*
1404 * Someone else has requested upgrade.
1405 * Since we've released the read lock, wake
1406 * him up if he's blocked waiting
1407 */
1408 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1409 }
1410 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1411 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1412
1413 return (FALSE);
1414}
1415
1416
1417/*
1418 * Routine: lck_rw_lock_shared_to_exclusive_success
1419 * Function:
1420 * assembly fast path code has already dropped our read
1421 * count and successfully acquired 'lck_rw_want_upgrade'
1422 * we just need to wait for the rest of the readers to drain
1423 * and then we can return as the exclusive holder of this lock
1424 */
1425boolean_t
1426lck_rw_lock_shared_to_exclusive_success(
1427 lck_rw_t *lck)
1428{
1429 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1430 uint64_t deadline = 0;
1431 int slept = 0;
1432 int still_shared = 0;
1433 wait_result_t res;
1434 boolean_t istate = -1;
1435
1436#if CONFIG_DTRACE
1437 uint64_t wait_interval = 0;
1438 int readers_at_sleep = 0;
1439 boolean_t dtrace_ls_initialized = FALSE;
1440 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1441#endif
1442
1443 while (lck->lck_rw_shared_count != 0) {
1444
1445#if CONFIG_DTRACE
1446 if (dtrace_ls_initialized == FALSE) {
1447 dtrace_ls_initialized = TRUE;
1448 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1449 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1450 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1451 if (dtrace_ls_enabled) {
1452 /*
1453 * Either sleeping or spinning is happening,
1454 * start a timing of our delay interval now.
1455 */
1456 readers_at_sleep = lck->lck_rw_shared_count;
1457 wait_interval = mach_absolute_time();
1458 }
1459 }
1460#endif
1461 if (istate == -1)
1462 istate = ml_get_interrupts_enabled();
1463
1464 deadline = lck_rw_deadline_for_spin(lck);
1465
1466 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1467 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1468
1469 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1470 lck_rw_lock_pause(istate);
1471
1472 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1473 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1474
1475 if ( !still_shared)
1476 break;
1477 /*
1478 * if we get here, the deadline has expired w/o
1479 * the rw_shared_count having drained to 0
1480 * check to see if we're allowed to do a thread_block
1481 */
1482 if (lck->lck_rw_can_sleep) {
1483
1484 istate = lck_interlock_lock(lck);
1485
1486 if (lck->lck_rw_shared_count != 0) {
1487 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1488 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1489
1490 lck->lck_w_waiting = TRUE;
1491
1492 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1493 lck_interlock_unlock(lck, istate);
1494
1495 if (res == THREAD_WAITING) {
1496 res = thread_block(THREAD_CONTINUE_NULL);
1497 slept++;
1498 }
1499 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1500 trace_lck, res, slept, 0, 0);
1501 } else {
1502 lck_interlock_unlock(lck, istate);
1503 break;
1504 }
1505 }
1506 }
1507#if CONFIG_DTRACE
1508 /*
1509 * Decide whether we spun or slept above and record the matching dtrace event.
1510 */
1511 if (dtrace_ls_enabled == TRUE) {
1512 if (slept == 0) {
1513 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1514 } else {
1515 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1516 mach_absolute_time() - wait_interval, 1,
1517 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1518 }
1519 }
1520 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1521#endif
1522 return (TRUE);
1523}
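
/*
 * Illustrative sketch (not part of the original file): the caller-visible
 * contract of the upgrade path handled by the two routines above.  When the
 * upgrade fails the shared hold has already been dropped, so the caller must
 * start over.  Names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	lck_rw_lock_shared(example_rwlock);

	if (!lck_rw_lock_shared_to_exclusive(example_rwlock)) {
		/* upgrade failed: the read hold is gone, reacquire exclusively */
		lck_rw_lock_exclusive(example_rwlock);
	}
	/* ... exclusive access either way at this point ... */
	lck_rw_done(example_rwlock);
#endif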
1524
1525
1526/*
1527 * Routine: lck_rw_lock_exclusive_to_shared
1528 * Function:
1529 * assembly fast path has already dropped
1530 * our exclusive state and bumped lck_rw_shared_count
1531 * all we need to do here is determine if anyone
1532 * needs to be awakened.
1533 */
1534void
1535lck_rw_lock_exclusive_to_shared_gen(
1536 lck_rw_t *lck,
1537 int prior_lock_state)
1538{
1539 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1540 lck_rw_t *fake_lck;
1541
1542 /*
1543 * prior_lock state is a snapshot of the 1st word of the
1544 * lock in question... we'll fake up a pointer to it
1545 * and carefully not access anything beyond whats defined
1546 * in the first word of a lck_rw_t
1547 */
1548 fake_lck = (lck_rw_t *)&prior_lock_state;
1549
1550 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1551 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1552
1553 /*
1554 * don't wake up anyone waiting to take the lock exclusively
1555 * since we hold a read count... when the read count drops to 0,
1556 * the writers will be woken.
1557 *
1558 * wake up any waiting readers if we don't have any writers waiting,
1559 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1560 */
1561 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1562 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1563
1564 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1565 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1566
1567#if CONFIG_DTRACE
1568 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1569#endif
1570}
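
/*
 * Illustrative sketch (not part of the original file): exclusive-to-shared
 * downgrade as seen by a caller of the lck_rw_* interface.  Names are
 * hypothetical.
 */
#if 0	/* example only -- not compiled */
	lck_rw_lock_exclusive(example_rwlock);
	/* ... publish an update ... */
	lck_rw_lock_exclusive_to_shared(example_rwlock);	/* other readers may now enter */
	/* ... continue with read-only access ... */
	lck_rw_done(example_rwlock);
#endif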
1571
1572
1573/*
1574 * Routine: lck_rw_try_lock
1575 */
1576boolean_t
1577lck_rw_try_lock(
1578 lck_rw_t *lck,
1579 lck_rw_type_t lck_rw_type)
1580{
1581 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1582 return(lck_rw_try_lock_shared(lck));
1583 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1584 return(lck_rw_try_lock_exclusive(lck));
1585 else
1586 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1587 return(FALSE);
1588}
1589
1590
1591void
1592lck_rw_assert(
1593 lck_rw_t *lck,
1594 unsigned int type)
1595{
1596 switch (type) {
1597 case LCK_RW_ASSERT_SHARED:
1598 if (lck->lck_rw_shared_count != 0) {
1599 return;
1600 }
1601 break;
1602 case LCK_RW_ASSERT_EXCLUSIVE:
1603 if ((lck->lck_rw_want_write ||
1604 lck->lck_rw_want_upgrade) &&
1605 lck->lck_rw_shared_count == 0) {
1606 return;
1607 }
1608 break;
1609 case LCK_RW_ASSERT_HELD:
1610 if (lck->lck_rw_want_write ||
1611 lck->lck_rw_want_upgrade ||
1612 lck->lck_rw_shared_count != 0) {
1613 return;
1614 }
1615 break;
1616 case LCK_RW_ASSERT_NOTHELD:
1617 if (!(lck->lck_rw_want_write ||
1618 lck->lck_rw_want_upgrade ||
1619 lck->lck_rw_shared_count != 0)) {
1620 return;
1621 }
1622 break;
1623 default:
1624 break;
1625 }
1626
1627 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1628}
1629
1630/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1631void
1632lck_rw_clear_promotions_x86(thread_t thread)
1633{
1634#if MACH_LDEBUG
1635 /* It's fatal to leave a RW lock locked and return to userspace */
1636 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1637#else
1638 /* Paper over the issue */
1639 thread->rwlock_count = 0;
1640 lck_rw_clear_promotion(thread);
1641#endif
1642}
1643
1644
1645/*
1646 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1647 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1648 */
1649boolean_t
1650kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
1651 if (not_in_kdp) {
1652 panic("rw lock exclusive check done outside of kernel debugger");
1653 }
1654 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1655}
1656
1657
1658#ifdef MUTEX_ZONE
1659extern zone_t lck_mtx_zone;
1660#endif
1661/*
1662 * Routine: lck_mtx_alloc_init
1663 */
1664lck_mtx_t *
1665lck_mtx_alloc_init(
1666 lck_grp_t *grp,
1667 lck_attr_t *attr)
1668{
1669 lck_mtx_t *lck;
1670#ifdef MUTEX_ZONE
1671 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
1672 lck_mtx_init(lck, grp, attr);
1673#else
1674 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1675 lck_mtx_init(lck, grp, attr);
1676#endif
1677 return(lck);
1678}
1679
1680/*
1681 * Routine: lck_mtx_free
1682 */
1683void
1684lck_mtx_free(
1685 lck_mtx_t *lck,
1686 lck_grp_t *grp)
1687{
1688 lck_mtx_destroy(lck, grp);
1689#ifdef MUTEX_ZONE
1690 zfree(lck_mtx_zone, lck);
1691#else
1692 kfree(lck, sizeof(lck_mtx_t));
1693#endif
1694}
1695
1696/*
1697 * Routine: lck_mtx_ext_init
1698 */
1699static void
1700lck_mtx_ext_init(
1701 lck_mtx_ext_t *lck,
1702 lck_grp_t *grp,
1703 lck_attr_t *attr)
1704{
1705 bzero((void *)lck, sizeof(lck_mtx_ext_t));
1706
1707 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1708 lck->lck_mtx_deb.type = MUTEX_TAG;
1709 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1710 }
1711
1712 lck->lck_mtx_grp = grp;
1713
1714 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1715 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1716
1717 lck->lck_mtx.lck_mtx_is_ext = 1;
1718 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
1719}
1720
1721/*
1722 * Routine: lck_mtx_init
1723 */
1724void
1725lck_mtx_init(
1726 lck_mtx_t *lck,
1727 lck_grp_t *grp,
1728 lck_attr_t *attr)
1729{
1730 lck_mtx_ext_t *lck_ext;
1731 lck_attr_t *lck_attr;
1732
1733 if (attr != LCK_ATTR_NULL)
1734 lck_attr = attr;
1735 else
1736 lck_attr = &LockDefaultLckAttr;
1737
1738 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1739 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1740 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1741 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1742 lck->lck_mtx_ptr = lck_ext;
1743 }
1744 } else {
1745 lck->lck_mtx_owner = 0;
1746 lck->lck_mtx_state = 0;
1747 }
1748 lck->lck_mtx_pad32 = 0xFFFFFFFF;
1749 lck_grp_reference(grp);
1750 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1751}
1752
1753/*
1754 * Routine: lck_mtx_init_ext
1755 */
1756void
1757lck_mtx_init_ext(
1758 lck_mtx_t *lck,
1759 lck_mtx_ext_t *lck_ext,
1760 lck_grp_t *grp,
1761 lck_attr_t *attr)
1762{
1763 lck_attr_t *lck_attr;
1764
1765 if (attr != LCK_ATTR_NULL)
1766 lck_attr = attr;
1767 else
1768 lck_attr = &LockDefaultLckAttr;
1769
1770 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1771 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1772 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1773 lck->lck_mtx_ptr = lck_ext;
1774 } else {
1775 lck->lck_mtx_owner = 0;
1776 lck->lck_mtx_state = 0;
1777 }
1778 lck->lck_mtx_pad32 = 0xFFFFFFFF;
1779
1780 lck_grp_reference(grp);
1781 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1782}
1783
1784/*
1785 * Routine: lck_mtx_destroy
1786 */
1787void
1788lck_mtx_destroy(
1789 lck_mtx_t *lck,
1790 lck_grp_t *grp)
1791{
1792 boolean_t lck_is_indirect;
1793
1794 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1795 return;
1796#if MACH_LDEBUG
1797 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
1798#endif
1799 lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1800
1801 lck_mtx_lock_mark_destroyed(lck);
1802
1803 if (lck_is_indirect)
1804 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1805 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1806 lck_grp_deallocate(grp);
1807 return;
1808}
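
/*
 * Illustrative sketch (not part of the original file): typical blocking-mutex
 * use of the lck_mtx_* interface managed above.  Names are hypothetical.
 */
#if 0	/* example only -- not compiled */
static lck_grp_t	*example_mtx_grp;
static lck_mtx_t	*example_mtx;

static void
example_mtx_usage(void)
{
	example_mtx_grp = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	example_mtx = lck_mtx_alloc_init(example_mtx_grp, LCK_ATTR_NULL);

	lck_mtx_lock(example_mtx);	/* may block; the holder may be priority-promoted */
	/* ... critical section that is itself allowed to block ... */
	lck_mtx_unlock(example_mtx);

	lck_mtx_free(example_mtx, example_mtx_grp);
	lck_grp_free(example_mtx_grp);
}
#endif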
1809
1810
1811#define LCK_MTX_LCK_WAIT_CODE 0x20
1812#define LCK_MTX_LCK_WAKEUP_CODE 0x21
1813#define LCK_MTX_LCK_SPIN_CODE 0x22
1814#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
1815#define LCK_MTX_LCK_DEMOTE_CODE 0x24
1816
1817
1818/*
1819 * Routine: lck_mtx_unlock_wakeup_x86
1820 *
1821 * Invoked on unlock when there is
1822 * contention (i.e. the assembly routine sees
1823 * that mutex->lck_mtx_waiters != 0 or
1824 * that mutex->lck_mtx_promoted != 0...
1825 *
1826 * neither the mutex or interlock is held
1827 */
1828void
1829lck_mtx_unlock_wakeup_x86 (
1830 lck_mtx_t *mutex,
1831 int prior_lock_state)
1832{
1833 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1834 lck_mtx_t fake_lck;
1835
1836 /*
1837 * prior_lock state is a snapshot of the 2nd word of the
1838 * lock in question... we'll fake up a lock with the bits
1839 * copied into place and carefully not access anything
1840 * beyond whats defined in the second word of a lck_mtx_t
1841 */
1842 fake_lck.lck_mtx_state = prior_lock_state;
1843
1844 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
1845 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
1846
1847 if (__probable(fake_lck.lck_mtx_waiters)) {
1848 if (fake_lck.lck_mtx_waiters > 1)
1849 thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
1850 else
1851 thread_wakeup_one(LCK_MTX_EVENT(mutex));
1852 }
1853
1854 if (__improbable(fake_lck.lck_mtx_promoted)) {
1855 thread_t thread = current_thread();
1856
1857
1858 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
1859 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
1860
1861 if (thread->promotions > 0) {
1862 spl_t s = splsched();
1863
1864 thread_lock(thread);
1865
1866 if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
1867
1868 thread->sched_flags &= ~TH_SFLAG_PROMOTED;
1869
1870 if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
1871 /* Thread still has a RW lock promotion */
1872 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1873 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1874 thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
1875
1876 set_sched_pri(thread, DEPRESSPRI);
1877 }
1878 else {
1879 if (thread->base_pri < thread->sched_pri) {
1880 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1881 thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
1882
1883 thread_recompute_sched_pri(thread, FALSE);
1884 }
1885 }
1886 }
1887 thread_unlock(thread);
1888 splx(s);
1889 }
1890 }
1891 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
1892 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1893}
1894
1895
1896/*
1897 * Routine: lck_mtx_lock_acquire_x86
1898 *
1899 * Invoked on acquiring the mutex when there is
1900 * contention (i.e. the assembly routine sees
1901 * that mutex->lck_mtx_waiters != 0 or
1902 * thread->was_promoted_on_wakeup != 0)...
1903 *
1904 * mutex is owned... interlock is held... preemption is disabled
1905 */
1906void
1907lck_mtx_lock_acquire_x86(
1908 lck_mtx_t *mutex)
1909{
1910 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1911 thread_t thread;
1912 integer_t priority;
1913 spl_t s;
1914
1915 thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
1916
1917 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
1918 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1919
1920 if (mutex->lck_mtx_waiters)
1921 priority = mutex->lck_mtx_pri;
1922 else
1923 priority = 0;
1924
1925 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
1926
1927 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
1928 thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);
1929
1930 s = splsched();
1931 thread_lock(thread);
1932
1933 if (thread->sched_pri < priority) {
1934 /* Do not promote past promotion ceiling */
1935 assert(priority <= MAXPRI_PROMOTE);
1936 set_sched_pri(thread, priority);
1937 }
1938 if (mutex->lck_mtx_promoted == 0) {
1939 mutex->lck_mtx_promoted = 1;
1940
1941 thread->promotions++;
1942 thread->sched_flags |= TH_SFLAG_PROMOTED;
1943 }
1944 thread->was_promoted_on_wakeup = 0;
1945
1946 thread_unlock(thread);
1947 splx(s);
1948 }
1949 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1950 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1951}
1952
1953
1954static int
1955lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
1956{
1957 int retval;
1958
1959 *istate = ml_set_interrupts_enabled(FALSE);
1960 retval = lck_mtx_ilk_try_lock(mutex);
1961
1962 if (retval == 0)
1963 ml_set_interrupts_enabled(*istate);
1964
1965 return retval;
1966}
1967
1968static void
1969lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
1970{
1971 lck_mtx_ilk_unlock(mutex);
1972 ml_set_interrupts_enabled(istate);
1973}
1974
1975
1976/*
1977 * Routine: lck_mtx_lock_spinwait_x86
1978 *
1979 * Invoked trying to acquire a mutex when there is contention but
1980 * the holder is running on another processor. We spin for up to a maximum
1981 * time waiting for the lock to be released.
1982 *
1983 * Called with the interlock unlocked.
1984 * returns 0 if mutex acquired
1985 * returns 1 if we spun
1986 * returns 2 if we didn't spin due to the holder not running
1987 */
1988int
1989lck_mtx_lock_spinwait_x86(
1990 lck_mtx_t *mutex)
1991{
1992 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1993 thread_t holder;
1994 uint64_t overall_deadline;
1995 uint64_t check_owner_deadline;
1996 uint64_t cur_time;
1997 int retval = 1;
1998 int loopcount = 0;
1999
2000 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2001 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
2002
2003 cur_time = mach_absolute_time();
2004 overall_deadline = cur_time + MutexSpin;
2005 check_owner_deadline = cur_time;
2006
2007 /*
2008 * Spin while:
2009 * - mutex is locked, and
2010 * - its locked as a spin lock, and
2011 * - owner is running on another processor, and
2012 * - owner (processor) is not idling, and
2013 * - we haven't spun for long enough.
2014 */
2015 do {
2016 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
2017 retval = 0;
2018 break;
2019 }
2020 cur_time = mach_absolute_time();
2021
2022 if (cur_time >= overall_deadline)
2023 break;
2024
2025 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
2026 boolean_t istate;
2027
2028 if (lck_mtx_interlock_try_lock(mutex, &istate)) {
2029
2030 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
2031
2032 if ( !(holder->machine.specFlags & OnProc) ||
2033 (holder->state & TH_IDLE)) {
2034
2035 lck_mtx_interlock_unlock(mutex, istate);
2036
2037 if (loopcount == 0)
2038 retval = 2;
2039 break;
2040 }
2041 }
2042 lck_mtx_interlock_unlock(mutex, istate);
2043
2044 check_owner_deadline = cur_time + (MutexSpin / 4);
2045 }
2046 }
2047 cpu_pause();
2048
2049 loopcount++;
2050
2051 } while (TRUE);
2052
2053#if CONFIG_DTRACE
2054 /*
2055 * We've already kept a count via overall_deadline of how long we spun.
2056 * If dtrace is active, then we compute backwards to decide how
2057 * long we spun.
2058 *
2059 * Note that we record a different probe id depending on whether
2060 * this is a direct or indirect mutex. This allows us to
2061 * penalize only lock groups that have debug/stats enabled
2062 * with dtrace processing if desired.
2063 */
2064 if (__probable(mutex->lck_mtx_is_ext == 0)) {
2065 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
2066 mach_absolute_time() - (overall_deadline - MutexSpin));
2067 } else {
2068 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
2069 mach_absolute_time() - (overall_deadline - MutexSpin));
2070 }
2071 /* The lockstat acquire event is recorded by the assembly code beneath us. */
2072#endif
2073
2074 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2075 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
2076
2077 return retval;
2078}
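
/*
 * Illustrative sketch (not part of the original file): how a caller consumes
 * the return value documented above.  This is a sketch of the pattern, not
 * the actual assembly fastpath.
 */
#if 0	/* example only -- not compiled */
	int spin_result = lck_mtx_lock_spinwait_x86(mutex);

	if (spin_result != 0) {
		/*
		 * 1: we spun for the full interval without getting the mutex;
		 * 2: we gave up early because the holder is not running.
		 * Either way the fastpath now takes the interlock and calls
		 * lck_mtx_lock_wait_x86() to block until the mutex is free.
		 */
	}
#endif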
2079
2080
2081
2082/*
2083 * Routine: lck_mtx_lock_wait_x86
2084 *
2085 * Invoked in order to wait on contention.
2086 *
2087 * Called with the interlock locked and
2088 * preemption disabled...
2089 * returns it unlocked and with preemption enabled
2090 */
2091void
2092lck_mtx_lock_wait_x86 (
2093 lck_mtx_t *mutex)
2094{
2095 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2096 thread_t self = current_thread();
2097 thread_t holder;
2098 integer_t priority;
2099 spl_t s;
2100#if CONFIG_DTRACE
2101 uint64_t sleep_start = 0;
2102
2103 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
2104 sleep_start = mach_absolute_time();
2105 }
2106#endif
2107 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
2108 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2109
2110 priority = self->sched_pri;
2111
2112 if (priority < self->base_pri)
2113 priority = self->base_pri;
2114 if (priority < BASEPRI_DEFAULT)
2115 priority = BASEPRI_DEFAULT;
2116
2117 /* Do not promote past promotion ceiling */
2118 priority = MIN(priority, MAXPRI_PROMOTE);
2119
2120 if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
2121 mutex->lck_mtx_pri = priority;
2122 mutex->lck_mtx_waiters++;
2123
2124 if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
2125 holder->sched_pri < mutex->lck_mtx_pri ) {
2126 s = splsched();
2127 thread_lock(holder);
2128
2129 /* holder priority may have been bumped by another thread
2130 * before thread_lock was taken
2131 */
2132 if (holder->sched_pri < mutex->lck_mtx_pri) {
2133 KERNEL_DEBUG_CONSTANT(
2134 MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
2135 holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
2136 /* Assert that we're not altering the priority of a
2137 * thread above the MAXPRI_PROMOTE band
2138 */
2139 assert(holder->sched_pri < MAXPRI_PROMOTE);
2140 set_sched_pri(holder, priority);
2141
2142 if (mutex->lck_mtx_promoted == 0) {
2143 holder->promotions++;
2144 holder->sched_flags |= TH_SFLAG_PROMOTED;
2145
2146 mutex->lck_mtx_promoted = 1;
2147 }
2148 }
2149 thread_unlock(holder);
2150 splx(s);
2151 }
2152 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
2153
2154 lck_mtx_ilk_unlock(mutex);
2155
2156 thread_block(THREAD_CONTINUE_NULL);
2157
2158 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
2159 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2160
2161#if CONFIG_DTRACE
2162 /*
2163 * Record the Dtrace lockstat probe for blocking, block time
2164 * measured from when we were entered.
2165 */
2166 if (sleep_start) {
2167 if (mutex->lck_mtx_is_ext == 0) {
2168 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2169 mach_absolute_time() - sleep_start);
2170 } else {
2171 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2172 mach_absolute_time() - sleep_start);
2173 }
2174 }
2175#endif
2176}
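
/*
 * Illustrative note (not part of the original file): the priority pushed onto
 * the owner by lck_mtx_lock_wait_x86() above is
 *
 *	MIN(MAX(self->sched_pri, self->base_pri, BASEPRI_DEFAULT), MAXPRI_PROMOTE)
 *
 * For example, a waiter currently running depressed at priority 11 with a
 * base_pri of 31 promotes a lower-priority holder to 31, and no waiter can
 * ever push a holder past MAXPRI_PROMOTE.
 */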
2177
2178/*
2179 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2180 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2181 * Returns: TRUE if lock is acquired.
2182 */
2183boolean_t
2184kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2185{
2186 if (not_in_kdp) {
2187 panic("kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
2188 }
2189
2190 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
2191 return TRUE;
2192 }
2193
2194 return FALSE;
2195}
2196