]> git.saurik.com Git - apple/xnu.git/blob - osfmk/i386/locks_i386.c
xnu-4903.231.4.tar.gz
[apple/xnu.git] / osfmk / i386 / locks_i386.c
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
/*
 * Defined ahead of the #include block below.
 * NOTE(review): presumably these opt this file in to the private parts of
 * the atomic and lock headers -- confirm against those headers.
 */
#define ATOMIC_PRIVATE	1
#define LOCK_PRIVATE	1
66
67 #include <mach_ldebug.h>
68
69 #include <kern/locks.h>
70 #include <kern/kalloc.h>
71 #include <kern/misc_protos.h>
72 #include <kern/thread.h>
73 #include <kern/processor.h>
74 #include <kern/cpu_data.h>
75 #include <kern/cpu_number.h>
76 #include <kern/sched_prim.h>
77 #include <kern/xpr.h>
78 #include <kern/debug.h>
79 #include <string.h>
80
81 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
82 #include <machine/atomic.h>
83 #include <machine/machine_cpu.h>
84 #include <i386/mp.h>
85 #include <machine/atomic.h>
86 #include <sys/kdebug.h>
87 #include <i386/locks_i386_inlines.h>
88
/*
 * Only a minimal slice of the BSD side is wanted here: enough to test
 * whether a probe is active and to call __dtrace_probe().  Defining
 * NEED_DTRACE_DEFS before the include pulls in a local copy of those
 * definitions.
 */
#if CONFIG_DTRACE
#define NEED_DTRACE_DEFS
#include <../bsd/sys/lockstat.h>

/* rw-lock mode flags, as annotated in the original source */
#define DTRACE_RW_SHARED	0x0	/* reader */
#define DTRACE_RW_EXCL		0x1	/* writer */
#define DTRACE_NO_FLAG		0x0	/* not applicable */

#endif /* CONFIG_DTRACE */
103
/*
 * Trace sub-codes for the rw-lock paths.  They are combined with
 * DBG_MACH_LOCKS through MACHDBG_CODE() at the KERNEL_DEBUG trace points
 * in this file.
 */
#define LCK_RW_LCK_EXCLUSIVE_CODE	0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
#define LCK_RW_LCK_SHARED_CODE		0x102
#define LCK_RW_LCK_SH_TO_EX_CODE	0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE	0x104
#define LCK_RW_LCK_EX_TO_SH_CODE	0x105

/* Spin/wait phases; note the values jump from 0x109 to 0x110. */
#define LCK_RW_LCK_EX_WRITER_SPIN_CODE	0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE	0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE	0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE	0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE	0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE	0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE	0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE	0x113
119
120
/* True when any of the lock debugging facilities is compiled in. */
#define ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

/*
 * Global lock options word, initially 0.
 * NOTE(review): not read in the part of the file visible here -- the
 * meaning of its bits should be confirmed at the point of use.
 */
unsigned int	LcksOpts = 0;

#if DEVELOPMENT || DEBUG
/* Only present on DEVELOPMENT/DEBUG kernels; starts out as 0. */
unsigned int	LckDisablePreemptCheck = 0;
#endif
128
129 /* Forwards */
130
#if USLOCK_DEBUG
/*
 * Knobs for the simple-lock debug checks.
 *
 * uslock_check gates USLOCK_CHECKING() and decides whether
 * usld_lock_init() marks a new lock as checked.
 */
int	uslock_check = 1;
int	max_lock_loops = 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
#endif /* USLOCK_DEBUG */

/* Tested by kdp_lck_spin_is_acquired(), which panics when it is non-zero. */
extern unsigned int	not_in_kdp;
142
/*
 * The lock routines frequently want the address of their caller, but
 * that information only feeds debugging and statistics.
 *
 * With ANY_LOCK_DEBUG:  DECL_PC() declares a pc_t local and OBTAIN_PC()
 *                       fills it from GET_RETURN_PC().
 * Without it:           both expand to nothing (except under lint, where
 *                       OBTAIN_PC() touches the variable to silence
 *                       unused-variable complaints).
 */
typedef void	*pc_t;
#define INVALID_PC	((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD	((void *) VM_MAX_KERNEL_ADDRESS)
#if ANY_LOCK_DEBUG
#define OBTAIN_PC(pc)	((pc) = GET_RETURN_PC())
#define DECL_PC(pc)	pc_t pc;
#else /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef lint
/*
 * Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc)	++pc
#else /* lint */
#define OBTAIN_PC(pc)
#endif /* lint */
#endif /* ANY_LOCK_DEBUG */
165
166 /*
167 * atomic exchange API is a low level abstraction of the operations
168 * to atomically read, modify, and write a pointer. This abstraction works
169 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
170 * well as the ARM exclusive instructions.
171 *
172 * atomic_exchange_begin() - begin exchange and retrieve current value
173 * atomic_exchange_complete() - conclude an exchange
174 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
175 */
176 static uint32_t
177 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
178 {
179 uint32_t val;
180
181 (void)ord; // Memory order not used
182 val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
183 *previous = val;
184 return val;
185 }
186
187 static boolean_t
188 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
189 {
190 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
191 }
192
/*
 * Cancel an exchange started with atomic_exchange_begin32().
 * This implementation has nothing to undo, so the body is empty.
 */
static void
atomic_exchange_abort(void)
{
}
195
196 static boolean_t
197 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
198 {
199 uint32_t value, prev;
200
201 for ( ; ; ) {
202 value = atomic_exchange_begin32(target, &prev, ord);
203 if (value & test_mask) {
204 if (wait)
205 cpu_pause();
206 else
207 atomic_exchange_abort();
208 return FALSE;
209 }
210 value |= set_mask;
211 if (atomic_exchange_complete32(target, prev, value, ord))
212 return TRUE;
213 }
214 }
215
216 /*
217 * Portable lock package implementation of usimple_locks.
218 */
219
/*
 * USLDBG(stmt) expands to 'stmt' on USLOCK_DEBUG kernels and to nothing
 * otherwise, so the debug hooks below vanish from production builds.
 */
#if USLOCK_DEBUG
#define USLDBG(stmt)	stmt
void	usld_lock_init(usimple_lock_t l, unsigned short tag);
void	usld_lock_pre(usimple_lock_t l, pc_t pc);
void	usld_lock_post(usimple_lock_t l, pc_t pc);
void	usld_unlock(usimple_lock_t l, pc_t pc);
void	usld_lock_try_pre(usimple_lock_t l, pc_t pc);
void	usld_lock_try_post(usimple_lock_t l, pc_t pc);
int	usld_lock_common_checks(usimple_lock_t l, char *caller);
#else /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif /* USLOCK_DEBUG */
232
233 /*
234 * Forward definitions
235 */
236
237 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
238 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
239 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
240 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
241 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
242 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
243 void lck_rw_clear_promotions_x86(thread_t thread);
244 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
245 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
246 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
247 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect);
248 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
249 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
250 static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
251 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
252 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
253 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
254
255
256 /*
257 * Routine: lck_spin_alloc_init
258 */
259 lck_spin_t *
260 lck_spin_alloc_init(
261 lck_grp_t *grp,
262 lck_attr_t *attr)
263 {
264 lck_spin_t *lck;
265
266 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
267 lck_spin_init(lck, grp, attr);
268
269 return(lck);
270 }
271
272 /*
273 * Routine: lck_spin_free
274 */
275 void
276 lck_spin_free(
277 lck_spin_t *lck,
278 lck_grp_t *grp)
279 {
280 lck_spin_destroy(lck, grp);
281 kfree(lck, sizeof(lck_spin_t));
282 }
283
284 /*
285 * Routine: lck_spin_init
286 */
287 void
288 lck_spin_init(
289 lck_spin_t *lck,
290 lck_grp_t *grp,
291 __unused lck_attr_t *attr)
292 {
293 usimple_lock_init((usimple_lock_t) lck, 0);
294 lck_grp_reference(grp);
295 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
296 }
297
298 /*
299 * Routine: lck_spin_destroy
300 */
301 void
302 lck_spin_destroy(
303 lck_spin_t *lck,
304 lck_grp_t *grp)
305 {
306 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
307 return;
308 lck->interlock = LCK_SPIN_TAG_DESTROYED;
309 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
310 lck_grp_deallocate(grp);
311 return;
312 }
313
314 /*
315 * Routine: lck_spin_lock
316 */
317 void
318 lck_spin_lock(
319 lck_spin_t *lck)
320 {
321 usimple_lock((usimple_lock_t) lck);
322 }
323
324 /*
325 * Routine: lck_spin_unlock
326 */
327 void
328 lck_spin_unlock(
329 lck_spin_t *lck)
330 {
331 usimple_unlock((usimple_lock_t) lck);
332 }
333
334
335 /*
336 * Routine: lck_spin_try_lock
337 */
338 boolean_t
339 lck_spin_try_lock(
340 lck_spin_t *lck)
341 {
342 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
343 #if DEVELOPMENT || DEBUG
344 if (lrval) {
345 pltrace(FALSE);
346 }
347 #endif
348 return(lrval);
349 }
350
351 /*
352 * Routine: lck_spin_assert
353 */
354 void
355 lck_spin_assert(lck_spin_t *lock, unsigned int type)
356 {
357 thread_t thread, holder;
358 uintptr_t state;
359
360 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
361 panic("lck_spin_assert(): invalid arg (%u)", type);
362 }
363
364 state = lock->interlock;
365 holder = (thread_t)state;
366 thread = current_thread();
367 if (type == LCK_ASSERT_OWNED) {
368 if (__improbable(holder == THREAD_NULL)) {
369 panic("Lock not owned %p = %lx", lock, state);
370 }
371 if (__improbable(holder != thread)) {
372 panic("Lock not owned by current thread %p = %lx", lock, state);
373 }
374 } else if (type == LCK_ASSERT_NOTOWNED) {
375 if (__improbable(holder != THREAD_NULL)) {
376 if (holder == thread) {
377 panic("Lock owned by current thread %p = %lx", lock, state);
378 } else {
379 panic("Lock %p owned by thread %p", lock, holder);
380 }
381 }
382 }
383 }
384
385 /*
386 * Routine: kdp_lck_spin_is_acquired
387 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
388 * Returns: TRUE if lock is acquired.
389 */
390 boolean_t
391 kdp_lck_spin_is_acquired(lck_spin_t *lck) {
392 if (not_in_kdp) {
393 panic("panic: spinlock acquired check done outside of kernel debugger");
394 }
395 return (lck->interlock != 0)? TRUE : FALSE;
396 }
397
398 /*
399 * Initialize a usimple_lock.
400 *
401 * No change in preemption state.
402 */
403 void
404 usimple_lock_init(
405 usimple_lock_t l,
406 __unused unsigned short tag)
407 {
408 #ifndef MACHINE_SIMPLE_LOCK
409 USLDBG(usld_lock_init(l, tag));
410 hw_lock_init(&l->interlock);
411 #else
412 simple_lock_init((simple_lock_t)l,tag);
413 #endif
414 }
415
416 volatile uint32_t spinlock_owner_cpu = ~0;
417 volatile usimple_lock_t spinlock_timed_out;
418
419 uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
420 uint32_t i;
421
422 for (i = 0; i < real_ncpus; i++) {
423 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
424 spinlock_owner_cpu = i;
425 if ((uint32_t) cpu_number() != i) {
426 /* Cause NMI and panic on the owner's cpu */
427 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
428 }
429 break;
430 }
431 }
432
433 return spinlock_owner_cpu;
434 }
435
436 /*
437 * Acquire a usimple_lock.
438 *
439 * Returns with preemption disabled. Note
440 * that the hw_lock routines are responsible for
441 * maintaining preemption state.
442 */
443 void
444 usimple_lock(
445 usimple_lock_t l)
446 {
447 #ifndef MACHINE_SIMPLE_LOCK
448 DECL_PC(pc);
449
450 OBTAIN_PC(pc);
451 USLDBG(usld_lock_pre(l, pc));
452
453 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
454 boolean_t uslock_acquired = FALSE;
455 while (machine_timeout_suspended()) {
456 enable_preemption();
457 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
458 break;
459 }
460
461 if (uslock_acquired == FALSE) {
462 uint32_t lock_cpu;
463 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
464 spinlock_timed_out = l;
465 lock_cpu = spinlock_timeout_NMI(lowner);
466 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
467 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
468 }
469 }
470 #if DEVELOPMENT || DEBUG
471 pltrace(FALSE);
472 #endif
473
474 USLDBG(usld_lock_post(l, pc));
475 #else
476 simple_lock((simple_lock_t)l);
477 #endif
478 #if CONFIG_DTRACE
479 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0);
480 #endif
481 }
482
483
484 /*
485 * Release a usimple_lock.
486 *
487 * Returns with preemption enabled. Note
488 * that the hw_lock routines are responsible for
489 * maintaining preemption state.
490 */
491 void
492 usimple_unlock(
493 usimple_lock_t l)
494 {
495 #ifndef MACHINE_SIMPLE_LOCK
496 DECL_PC(pc);
497
498 OBTAIN_PC(pc);
499 USLDBG(usld_unlock(l, pc));
500 #if DEVELOPMENT || DEBUG
501 pltrace(TRUE);
502 #endif
503 hw_lock_unlock(&l->interlock);
504 #else
505 simple_unlock_rwmb((simple_lock_t)l);
506 #endif
507 }
508
509
510 /*
511 * Conditionally acquire a usimple_lock.
512 *
513 * On success, returns with preemption disabled.
514 * On failure, returns with preemption in the same state
515 * as when first invoked. Note that the hw_lock routines
516 * are responsible for maintaining preemption state.
517 *
518 * XXX No stats are gathered on a miss; I preserved this
519 * behavior from the original assembly-language code, but
520 * doesn't it make sense to log misses? XXX
521 */
522 unsigned int
523 usimple_lock_try(
524 usimple_lock_t l)
525 {
526 #ifndef MACHINE_SIMPLE_LOCK
527 unsigned int success;
528 DECL_PC(pc);
529
530 OBTAIN_PC(pc);
531 USLDBG(usld_lock_try_pre(l, pc));
532 if ((success = hw_lock_try(&l->interlock))) {
533 #if DEVELOPMENT || DEBUG
534 pltrace(FALSE);
535 #endif
536 USLDBG(usld_lock_try_post(l, pc));
537 }
538 return success;
539 #else
540 return(simple_lock_try((simple_lock_t)l));
541 #endif
542 }
543
544 /*
545 * Acquire a usimple_lock while polling for pending TLB flushes
546 * and spinning on a lock.
547 *
548 */
549 void
550 usimple_lock_try_lock_loop(usimple_lock_t l)
551 {
552 boolean_t istate = ml_get_interrupts_enabled();
553 while (!simple_lock_try((l))) {
554 if (!istate)
555 handle_pending_TLB_flushes();
556 cpu_pause();
557 }
558 }
559
560 #if USLOCK_DEBUG
561 /*
562 * States of a usimple_lock. The default when initializing
563 * a usimple_lock is setting it up for debug checking.
564 */
565 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
566 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
567 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
568 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
569 #define USLOCK_CHECKING(l) (uslock_check && \
570 ((l)->debug.state & USLOCK_CHECKED))
571
572 /*
573 * Trace activities of a particularly interesting lock.
574 */
575 void usl_trace(usimple_lock_t, int, pc_t, const char *);
576
577
578 /*
579 * Initialize the debugging information contained
580 * in a usimple_lock.
581 */
582 void
583 usld_lock_init(
584 usimple_lock_t l,
585 __unused unsigned short tag)
586 {
587 if (l == USIMPLE_LOCK_NULL)
588 panic("lock initialization: null lock pointer");
589 l->lock_type = USLOCK_TAG;
590 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
591 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
592 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
593 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
594 l->debug.duration[0] = l->debug.duration[1] = 0;
595 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
596 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
597 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
598 }
599
600
601 /*
602 * These checks apply to all usimple_locks, not just
603 * those with USLOCK_CHECKED turned on.
604 */
605 int
606 usld_lock_common_checks(
607 usimple_lock_t l,
608 char *caller)
609 {
610 if (l == USIMPLE_LOCK_NULL)
611 panic("%s: null lock pointer", caller);
612 if (l->lock_type != USLOCK_TAG)
613 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
614 if (!(l->debug.state & USLOCK_INIT))
615 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
616 return USLOCK_CHECKING(l);
617 }
618
619
620 /*
621 * Debug checks on a usimple_lock just before attempting
622 * to acquire it.
623 */
624 /* ARGSUSED */
625 void
626 usld_lock_pre(
627 usimple_lock_t l,
628 pc_t pc)
629 {
630 char caller[] = "usimple_lock";
631
632
633 if (!usld_lock_common_checks(l, caller))
634 return;
635
636 /*
637 * Note that we have a weird case where we are getting a lock when we are]
638 * in the process of putting the system to sleep. We are running with no
639 * current threads, therefore we can't tell if we are trying to retake a lock
640 * we have or someone on the other processor has it. Therefore we just
641 * ignore this test if the locking thread is 0.
642 */
643
644 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
645 l->debug.lock_thread == (void *) current_thread()) {
646 printf("%s: lock %p already locked (at %p) by",
647 caller, l, l->debug.lock_pc);
648 printf(" current thread %p (new attempt at pc %p)\n",
649 l->debug.lock_thread, pc);
650 panic("%s", caller);
651 }
652 mp_disable_preemption();
653 usl_trace(l, cpu_number(), pc, caller);
654 mp_enable_preemption();
655 }
656
657
658 /*
659 * Debug checks on a usimple_lock just after acquiring it.
660 *
661 * Pre-emption has been disabled at this point,
662 * so we are safe in using cpu_number.
663 */
664 void
665 usld_lock_post(
666 usimple_lock_t l,
667 pc_t pc)
668 {
669 int mycpu;
670 char caller[] = "successful usimple_lock";
671
672
673 if (!usld_lock_common_checks(l, caller))
674 return;
675
676 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
677 panic("%s: lock %p became uninitialized",
678 caller, l);
679 if ((l->debug.state & USLOCK_TAKEN))
680 panic("%s: lock 0x%p became TAKEN by someone else",
681 caller, l);
682
683 mycpu = cpu_number();
684 l->debug.lock_thread = (void *)current_thread();
685 l->debug.state |= USLOCK_TAKEN;
686 l->debug.lock_pc = pc;
687 l->debug.lock_cpu = mycpu;
688
689 usl_trace(l, mycpu, pc, caller);
690 }
691
692
693 /*
694 * Debug checks on a usimple_lock just before
695 * releasing it. Note that the caller has not
696 * yet released the hardware lock.
697 *
698 * Preemption is still disabled, so there's
699 * no problem using cpu_number.
700 */
701 void
702 usld_unlock(
703 usimple_lock_t l,
704 pc_t pc)
705 {
706 int mycpu;
707 char caller[] = "usimple_unlock";
708
709
710 if (!usld_lock_common_checks(l, caller))
711 return;
712
713 mycpu = cpu_number();
714
715 if (!(l->debug.state & USLOCK_TAKEN))
716 panic("%s: lock 0x%p hasn't been taken",
717 caller, l);
718 if (l->debug.lock_thread != (void *) current_thread())
719 panic("%s: unlocking lock 0x%p, owned by thread %p",
720 caller, l, l->debug.lock_thread);
721 if (l->debug.lock_cpu != mycpu) {
722 printf("%s: unlocking lock 0x%p on cpu 0x%x",
723 caller, l, mycpu);
724 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
725 panic("%s", caller);
726 }
727 usl_trace(l, mycpu, pc, caller);
728
729 l->debug.unlock_thread = l->debug.lock_thread;
730 l->debug.lock_thread = INVALID_PC;
731 l->debug.state &= ~USLOCK_TAKEN;
732 l->debug.unlock_pc = pc;
733 l->debug.unlock_cpu = mycpu;
734 }
735
736
737 /*
738 * Debug checks on a usimple_lock just before
739 * attempting to acquire it.
740 *
741 * Preemption isn't guaranteed to be disabled.
742 */
743 void
744 usld_lock_try_pre(
745 usimple_lock_t l,
746 pc_t pc)
747 {
748 char caller[] = "usimple_lock_try";
749
750 if (!usld_lock_common_checks(l, caller))
751 return;
752 mp_disable_preemption();
753 usl_trace(l, cpu_number(), pc, caller);
754 mp_enable_preemption();
755 }
756
757
758 /*
759 * Debug checks on a usimple_lock just after
760 * successfully attempting to acquire it.
761 *
762 * Preemption has been disabled by the
763 * lock acquisition attempt, so it's safe
764 * to use cpu_number.
765 */
766 void
767 usld_lock_try_post(
768 usimple_lock_t l,
769 pc_t pc)
770 {
771 int mycpu;
772 char caller[] = "successful usimple_lock_try";
773
774 if (!usld_lock_common_checks(l, caller))
775 return;
776
777 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
778 panic("%s: lock 0x%p became uninitialized",
779 caller, l);
780 if ((l->debug.state & USLOCK_TAKEN))
781 panic("%s: lock 0x%p became TAKEN by someone else",
782 caller, l);
783
784 mycpu = cpu_number();
785 l->debug.lock_thread = (void *) current_thread();
786 l->debug.state |= USLOCK_TAKEN;
787 l->debug.lock_pc = pc;
788 l->debug.lock_cpu = mycpu;
789
790 usl_trace(l, mycpu, pc, caller);
791 }
792
793
794 /*
795 * For very special cases, set traced_lock to point to a
796 * specific lock of interest. The result is a series of
797 * XPRs showing lock operations on that lock. The lock_seq
798 * value is used to show the order of those operations.
799 */
800 usimple_lock_t traced_lock;
801 unsigned int lock_seq;
802
803 void
804 usl_trace(
805 usimple_lock_t l,
806 int mycpu,
807 pc_t pc,
808 const char * op_name)
809 {
810 if (traced_lock == l) {
811 XPR(XPR_SLOCK,
812 "seq %d, cpu %d, %s @ %x\n",
813 (uintptr_t) lock_seq, (uintptr_t) mycpu,
814 (uintptr_t) op_name, (uintptr_t) pc, 0);
815 lock_seq++;
816 }
817 }
818
819
820 #endif /* USLOCK_DEBUG */
821
822 /*
823 * Routine: lck_rw_alloc_init
824 */
825 lck_rw_t *
826 lck_rw_alloc_init(
827 lck_grp_t *grp,
828 lck_attr_t *attr) {
829 lck_rw_t *lck;
830
831 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
832 bzero(lck, sizeof(lck_rw_t));
833 lck_rw_init(lck, grp, attr);
834 }
835
836 return(lck);
837 }
838
839 /*
840 * Routine: lck_rw_free
841 */
842 void
843 lck_rw_free(
844 lck_rw_t *lck,
845 lck_grp_t *grp) {
846 lck_rw_destroy(lck, grp);
847 kfree(lck, sizeof(lck_rw_t));
848 }
849
850 /*
851 * Routine: lck_rw_init
852 */
853 void
854 lck_rw_init(
855 lck_rw_t *lck,
856 lck_grp_t *grp,
857 lck_attr_t *attr)
858 {
859 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
860 attr : &LockDefaultLckAttr;
861
862 hw_lock_byte_init(&lck->lck_rw_interlock);
863 lck->lck_rw_want_write = FALSE;
864 lck->lck_rw_want_upgrade = FALSE;
865 lck->lck_rw_shared_count = 0;
866 lck->lck_rw_can_sleep = TRUE;
867 lck->lck_r_waiting = lck->lck_w_waiting = 0;
868 lck->lck_rw_tag = 0;
869 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
870 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
871
872 lck_grp_reference(grp);
873 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
874 }
875
876 /*
877 * Routine: lck_rw_destroy
878 */
879 void
880 lck_rw_destroy(
881 lck_rw_t *lck,
882 lck_grp_t *grp)
883 {
884 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
885 return;
886 #if MACH_LDEBUG
887 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
888 #endif
889 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
890 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
891 lck_grp_deallocate(grp);
892 return;
893 }
894
895 /*
896 * Sleep locks. These use the same data structure and algorithm
897 * as the spin locks, but the process sleeps while it is waiting
898 * for the lock. These work on uniprocessor systems.
899 */
900
/* NOTE(review): not referenced in the part of the file visible here; units unconfirmed. */
#define DECREMENTER_TIMEOUT	1000000
902
903 /*
904 * We disable interrupts while holding the RW interlock to prevent an
905 * interrupt from exacerbating hold time.
906 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
907 */
908 static inline boolean_t
909 lck_interlock_lock(lck_rw_t *lck)
910 {
911 boolean_t istate;
912
913 istate = ml_set_interrupts_enabled(FALSE);
914 hw_lock_byte_lock(&lck->lck_rw_interlock);
915 return istate;
916 }
917
918 static inline void
919 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
920 {
921 hw_lock_byte_unlock(&lck->lck_rw_interlock);
922 ml_set_interrupts_enabled(istate);
923 }
924
925 /*
926 * This inline is used when busy-waiting for an rw lock.
927 * If interrupts were disabled when the lock primitive was called,
928 * we poll the IPI handler for pending tlb flushes.
929 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
930 */
931 static inline void
932 lck_rw_lock_pause(boolean_t interrupts_enabled)
933 {
934 if (!interrupts_enabled)
935 handle_pending_TLB_flushes();
936 cpu_pause();
937 }
938
939 static inline boolean_t
940 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
941 {
942 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE))
943 return TRUE;
944 return FALSE;
945 }
946
947 /*
948 * compute the deadline to spin against when
949 * waiting for a change of state on a lck_rw_t
950 */
951 static inline uint64_t
952 lck_rw_deadline_for_spin(lck_rw_t *lck)
953 {
954 if (lck->lck_rw_can_sleep) {
955 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
956 /*
957 * there are already threads waiting on this lock... this
958 * implies that they have spun beyond their deadlines waiting for
959 * the desired state to show up so we will not bother spinning at this time...
960 * or
961 * the current number of threads sharing this lock exceeds our capacity to run them
962 * concurrently and since all states we're going to spin for require the rw_shared_count
963 * to be at 0, we'll not bother spinning since the latency for this to happen is
964 * unpredictable...
965 */
966 return (mach_absolute_time());
967 }
968 return (mach_absolute_time() + MutexSpin);
969 } else
970 return (mach_absolute_time() + (100000LL * 1000000000LL));
971 }
972
973
974 /*
975 * Spin while interlock is held.
976 */
977
978 static inline void
979 lck_rw_interlock_spin(lck_rw_t *lock)
980 {
981 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
982 cpu_pause();
983 }
984 }
985
986 static boolean_t
987 lck_rw_grab_want(lck_rw_t *lock)
988 {
989 uint32_t data, prev;
990
991 for ( ; ; ) {
992 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
993 if ((data & LCK_RW_INTERLOCK) == 0)
994 break;
995 atomic_exchange_abort();
996 lck_rw_interlock_spin(lock);
997 }
998 if (data & LCK_RW_WANT_WRITE) {
999 atomic_exchange_abort();
1000 return FALSE;
1001 }
1002 data |= LCK_RW_WANT_WRITE;
1003 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1004 }
1005
1006 static boolean_t
1007 lck_rw_grab_shared(lck_rw_t *lock)
1008 {
1009 uint32_t data, prev;
1010
1011 for ( ; ; ) {
1012 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1013 if ((data & LCK_RW_INTERLOCK) == 0)
1014 break;
1015 atomic_exchange_abort();
1016 lck_rw_interlock_spin(lock);
1017 }
1018 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1019 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1020 atomic_exchange_abort();
1021 return FALSE;
1022 }
1023 }
1024 data += LCK_RW_SHARED_READER;
1025 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1026 }
1027
1028 /*
1029 * Routine: lck_rw_lock_exclusive
1030 */
1031 static void
1032 lck_rw_lock_exclusive_gen(
1033 lck_rw_t *lck)
1034 {
1035 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1036 uint64_t deadline = 0;
1037 int slept = 0;
1038 int gotlock = 0;
1039 int lockheld = 0;
1040 wait_result_t res = 0;
1041 boolean_t istate = -1;
1042
1043 #if CONFIG_DTRACE
1044 boolean_t dtrace_ls_initialized = FALSE;
1045 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
1046 uint64_t wait_interval = 0;
1047 int readers_at_sleep = 0;
1048 #endif
1049
1050 /*
1051 * Try to acquire the lck_rw_want_write bit.
1052 */
1053 while ( !lck_rw_grab_want(lck)) {
1054
1055 #if CONFIG_DTRACE
1056 if (dtrace_ls_initialized == FALSE) {
1057 dtrace_ls_initialized = TRUE;
1058 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1059 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1060 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1061 if (dtrace_ls_enabled) {
1062 /*
1063 * Either sleeping or spinning is happening,
1064 * start a timing of our delay interval now.
1065 */
1066 readers_at_sleep = lck->lck_rw_shared_count;
1067 wait_interval = mach_absolute_time();
1068 }
1069 }
1070 #endif
1071 if (istate == -1)
1072 istate = ml_get_interrupts_enabled();
1073
1074 deadline = lck_rw_deadline_for_spin(lck);
1075
1076 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1077
1078 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
1079 lck_rw_lock_pause(istate);
1080
1081 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1082
1083 if (gotlock)
1084 break;
1085 /*
1086 * if we get here, the deadline has expired w/o us
1087 * being able to grab the lock exclusively
1088 * check to see if we're allowed to do a thread_block
1089 */
1090 if (lck->lck_rw_can_sleep) {
1091
1092 istate = lck_interlock_lock(lck);
1093
1094 if (lck->lck_rw_want_write) {
1095
1096 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1097
1098 lck->lck_w_waiting = TRUE;
1099
1100 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1101 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1102 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1103 lck_interlock_unlock(lck, istate);
1104
1105 if (res == THREAD_WAITING) {
1106 res = thread_block(THREAD_CONTINUE_NULL);
1107 slept++;
1108 }
1109 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1110 } else {
1111 lck->lck_rw_want_write = TRUE;
1112 lck_interlock_unlock(lck, istate);
1113 break;
1114 }
1115 }
1116 }
1117 /*
1118 * Wait for readers (and upgrades) to finish...
1119 * the test for these conditions must be done simultaneously with
1120 * a check of the interlock not being held since
1121 * the rw_shared_count will drop to 0 first and then want_upgrade
1122 * will be set to 1 in the shared_to_exclusive scenario... those
1123 * adjustments are done behind the interlock and represent an
1124 * atomic change in state and must be considered as such
1125 * however, once we see the read count at 0, the want_upgrade not set
1126 * and the interlock not held, we are safe to proceed
1127 */
1128 while (lck_rw_held_read_or_upgrade(lck)) {
1129
1130 #if CONFIG_DTRACE
1131 /*
1132 * Either sleeping or spinning is happening, start
1133 * a timing of our delay interval now. If we set it
1134 * to -1 we don't have accurate data so we cannot later
1135 * decide to record a dtrace spin or sleep event.
1136 */
1137 if (dtrace_ls_initialized == FALSE) {
1138 dtrace_ls_initialized = TRUE;
1139 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1140 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1141 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1142 if (dtrace_ls_enabled) {
1143 /*
1144 * Either sleeping or spinning is happening,
1145 * start a timing of our delay interval now.
1146 */
1147 readers_at_sleep = lck->lck_rw_shared_count;
1148 wait_interval = mach_absolute_time();
1149 }
1150 }
1151 #endif
1152 if (istate == -1)
1153 istate = ml_get_interrupts_enabled();
1154
1155 deadline = lck_rw_deadline_for_spin(lck);
1156
1157 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1158
1159 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1160 lck_rw_lock_pause(istate);
1161
1162 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1163
1164 if ( !lockheld)
1165 break;
1166 /*
1167 * if we get here, the deadline has expired w/o us
1168 * being able to grab the lock exclusively
1169 * check to see if we're allowed to do a thread_block
1170 */
1171 if (lck->lck_rw_can_sleep) {
1172
1173 istate = lck_interlock_lock(lck);
1174
1175 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1176 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1177
1178 lck->lck_w_waiting = TRUE;
1179
1180 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1181 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1182 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1183 lck_interlock_unlock(lck, istate);
1184
1185 if (res == THREAD_WAITING) {
1186 res = thread_block(THREAD_CONTINUE_NULL);
1187 slept++;
1188 }
1189 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1190 } else {
1191 lck_interlock_unlock(lck, istate);
1192 /*
1193 * must own the lock now, since we checked for
1194 * readers or upgrade owner behind the interlock
1195 * no need for a call to 'lck_rw_held_read_or_upgrade'
1196 */
1197 break;
1198 }
1199 }
1200 }
1201
1202 #if CONFIG_DTRACE
1203 /*
1204 * Decide what latencies we suffered that are Dtrace events.
1205 * If we have set wait_interval, then we either spun or slept.
1206 * At least we get out from under the interlock before we record
1207 * which is the best we can do here to minimize the impact
1208 * of the tracing.
1209 * If we have set wait_interval to -1, then dtrace was not enabled when we
1210 * started sleeping/spinning so we don't record this event.
1211 */
1212 if (dtrace_ls_enabled == TRUE) {
1213 if (slept == 0) {
1214 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1215 mach_absolute_time() - wait_interval, 1);
1216 } else {
1217 /*
1218 * For the blocking case, we also record if when we blocked
1219 * it was held for read or write, and how many readers.
1220 * Notice that above we recorded this before we dropped
1221 * the interlock so the count is accurate.
1222 */
1223 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1224 mach_absolute_time() - wait_interval, 1,
1225 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1226 }
1227 }
1228 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1229 #endif
1230 }
1231
1232 /*
1233 * Routine: lck_rw_done
1234 */
1235
1236 lck_rw_type_t lck_rw_done(lck_rw_t *lock)
1237 {
1238 uint32_t data, prev;
1239
1240 for ( ; ; ) {
1241 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1242 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1243 atomic_exchange_abort();
1244 lck_rw_interlock_spin(lock);
1245 continue;
1246 }
1247 if (data & LCK_RW_SHARED_MASK) {
1248 data -= LCK_RW_SHARED_READER;
1249 if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */
1250 goto check_waiters;
1251 } else { /* if reader count == 0, must be exclusive lock */
1252 if (data & LCK_RW_WANT_UPGRADE) {
1253 data &= ~(LCK_RW_WANT_UPGRADE);
1254 } else {
1255 if (data & LCK_RW_WANT_WRITE)
1256 data &= ~(LCK_RW_WANT_EXCL);
1257 else /* lock is not 'owned', panic */
1258 panic("Releasing non-exclusive RW lock without a reader refcount!");
1259 }
1260 check_waiters:
1261 if (prev & LCK_RW_W_WAITING) {
1262 data &= ~(LCK_RW_W_WAITING);
1263 if ((prev & LCK_RW_PRIV_EXCL) == 0)
1264 data &= ~(LCK_RW_R_WAITING);
1265 } else
1266 data &= ~(LCK_RW_R_WAITING);
1267 }
1268 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1269 break;
1270 cpu_pause();
1271 }
1272 return lck_rw_done_gen(lock, prev);
1273 }
1274
1275 /*
1276 * Routine: lck_rw_done_gen
1277 *
1278 * called from lck_rw_done()
1279 * prior_lock_state is the value in the 1st
1280 * word of the lock at the time of a successful
1281 * atomic compare and exchange with the new value...
1282 * it represents the state of the lock before we
1283 * decremented the rw_shared_count or cleared either
1284 * rw_want_upgrade or rw_want_write and
1285 * the lck_x_waiting bits... since the wrapper
1286 * routine has already changed the state atomically,
1287 * we just need to decide if we should
1288 * wake up anyone and what value to return... we do
1289 * this by examining the state of the lock before
1290 * we changed it
1291 */
1292 static lck_rw_type_t
1293 lck_rw_done_gen(
1294 lck_rw_t *lck,
1295 uint32_t prior_lock_state)
1296 {
1297 lck_rw_t *fake_lck;
1298 lck_rw_type_t lock_type;
1299 thread_t thread;
1300 uint32_t rwlock_count;
1301
1302 /*
1303 * prior_lock state is a snapshot of the 1st word of the
1304 * lock in question... we'll fake up a pointer to it
1305 * and carefully not access anything beyond whats defined
1306 * in the first word of a lck_rw_t
1307 */
1308 fake_lck = (lck_rw_t *)&prior_lock_state;
1309
1310 if (fake_lck->lck_rw_shared_count <= 1) {
1311 if (fake_lck->lck_w_waiting)
1312 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1313
1314 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1315 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1316 }
1317 if (fake_lck->lck_rw_shared_count)
1318 lock_type = LCK_RW_TYPE_SHARED;
1319 else
1320 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1321
1322 /* Check if dropping the lock means that we need to unpromote */
1323 thread = current_thread();
1324 rwlock_count = thread->rwlock_count--;
1325 #if MACH_LDEBUG
1326 if (rwlock_count == 0) {
1327 panic("rw lock count underflow for thread %p", thread);
1328 }
1329 #endif
1330 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1331 /* sched_flags checked without lock, but will be rechecked while clearing */
1332 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1333 }
1334
1335 #if CONFIG_DTRACE
1336 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1337 #endif
1338
1339 return(lock_type);
1340 }
1341
1342
1343 /*
1344 * Routine: lck_rw_unlock
1345 */
1346 void
1347 lck_rw_unlock(
1348 lck_rw_t *lck,
1349 lck_rw_type_t lck_rw_type)
1350 {
1351 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1352 lck_rw_unlock_shared(lck);
1353 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1354 lck_rw_unlock_exclusive(lck);
1355 else
1356 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1357 }
1358
1359
1360 /*
1361 * Routine: lck_rw_unlock_shared
1362 */
1363 void
1364 lck_rw_unlock_shared(
1365 lck_rw_t *lck)
1366 {
1367 lck_rw_type_t ret;
1368
1369 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1370 ret = lck_rw_done(lck);
1371
1372 if (ret != LCK_RW_TYPE_SHARED)
1373 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1374 }
1375
1376
1377 /*
1378 * Routine: lck_rw_unlock_exclusive
1379 */
1380 void
1381 lck_rw_unlock_exclusive(
1382 lck_rw_t *lck)
1383 {
1384 lck_rw_type_t ret;
1385
1386 ret = lck_rw_done(lck);
1387
1388 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1389 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1390 }
1391
1392
1393 /*
1394 * Routine: lck_rw_lock
1395 */
1396 void
1397 lck_rw_lock(
1398 lck_rw_t *lck,
1399 lck_rw_type_t lck_rw_type)
1400 {
1401 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1402 lck_rw_lock_shared(lck);
1403 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1404 lck_rw_lock_exclusive(lck);
1405 else
1406 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1407 }
1408
1409 /*
1410 * Routine: lck_rw_lock_shared
1411 */
1412 void
1413 lck_rw_lock_shared(lck_rw_t *lock)
1414 {
1415 uint32_t data, prev;
1416
1417 current_thread()->rwlock_count++;
1418 for ( ; ; ) {
1419 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1420 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1421 atomic_exchange_abort();
1422 lck_rw_lock_shared_gen(lock);
1423 break;
1424 }
1425 data += LCK_RW_SHARED_READER;
1426 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1427 break;
1428 cpu_pause();
1429 }
1430 #if CONFIG_DTRACE
1431 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1432 #endif /* CONFIG_DTRACE */
1433 return;
1434 }
1435
1436 /*
1437 * Routine: lck_rw_lock_shared_gen
1438 * Function:
1439 * assembly fast path code has determined that this lock
1440 * is held exclusively... this is where we spin/block
1441 * until we can acquire the lock in the shared mode
1442 */
1443 static void
1444 lck_rw_lock_shared_gen(
1445 lck_rw_t *lck)
1446 {
1447 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1448 uint64_t deadline = 0;
1449 int gotlock = 0;
1450 int slept = 0;
1451 wait_result_t res = 0;
1452 boolean_t istate = -1;
1453
1454 #if CONFIG_DTRACE
1455 uint64_t wait_interval = 0;
1456 int readers_at_sleep = 0;
1457 boolean_t dtrace_ls_initialized = FALSE;
1458 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1459 #endif
1460
1461 while ( !lck_rw_grab_shared(lck)) {
1462
1463 #if CONFIG_DTRACE
1464 if (dtrace_ls_initialized == FALSE) {
1465 dtrace_ls_initialized = TRUE;
1466 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1467 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1468 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1469 if (dtrace_ls_enabled) {
1470 /*
1471 * Either sleeping or spinning is happening,
1472 * start a timing of our delay interval now.
1473 */
1474 readers_at_sleep = lck->lck_rw_shared_count;
1475 wait_interval = mach_absolute_time();
1476 }
1477 }
1478 #endif
1479 if (istate == -1)
1480 istate = ml_get_interrupts_enabled();
1481
1482 deadline = lck_rw_deadline_for_spin(lck);
1483
1484 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1485 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1486
1487 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1488 lck_rw_lock_pause(istate);
1489
1490 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1491 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1492
1493 if (gotlock)
1494 break;
1495 /*
1496 * if we get here, the deadline has expired w/o us
1497 * being able to grab the lock for read
1498 * check to see if we're allowed to do a thread_block
1499 */
1500 if (lck->lck_rw_can_sleep) {
1501
1502 istate = lck_interlock_lock(lck);
1503
1504 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1505 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1506
1507 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1508 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1509
1510 lck->lck_r_waiting = TRUE;
1511
1512 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1513 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1514 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1515 lck_interlock_unlock(lck, istate);
1516
1517 if (res == THREAD_WAITING) {
1518 res = thread_block(THREAD_CONTINUE_NULL);
1519 slept++;
1520 }
1521 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1522 trace_lck, res, slept, 0, 0);
1523 } else {
1524 lck->lck_rw_shared_count++;
1525 lck_interlock_unlock(lck, istate);
1526 break;
1527 }
1528 }
1529 }
1530
1531 #if CONFIG_DTRACE
1532 if (dtrace_ls_enabled == TRUE) {
1533 if (slept == 0) {
1534 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1535 } else {
1536 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1537 mach_absolute_time() - wait_interval, 0,
1538 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1539 }
1540 }
1541 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1542 #endif
1543 }
1544
1545
1546 /*
1547 * Routine: lck_rw_lock_exclusive
1548 */
1549
1550 void
1551 lck_rw_lock_exclusive(lck_rw_t *lock)
1552 {
1553 current_thread()->rwlock_count++;
1554 if (atomic_test_and_set32(&lock->data,
1555 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1556 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1557 #if CONFIG_DTRACE
1558 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1559 #endif /* CONFIG_DTRACE */
1560 } else
1561 lck_rw_lock_exclusive_gen(lock);
1562 }
1563
1564
1565 /*
1566 * Routine: lck_rw_lock_shared_to_exclusive
1567 */
1568
1569 boolean_t
1570 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1571 {
1572 uint32_t data, prev;
1573
1574 for ( ; ; ) {
1575 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1576 if (data & LCK_RW_INTERLOCK) {
1577 atomic_exchange_abort();
1578 lck_rw_interlock_spin(lock);
1579 continue;
1580 }
1581 if (data & LCK_RW_WANT_UPGRADE) {
1582 data -= LCK_RW_SHARED_READER;
1583 if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */
1584 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1585 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1586 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1587 } else {
1588 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1589 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1590 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1591 break;
1592 }
1593 cpu_pause();
1594 }
1595 /* we now own the WANT_UPGRADE */
1596 if (data & LCK_RW_SHARED_MASK) /* check to see if all of the readers are drained */
1597 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1598 #if CONFIG_DTRACE
1599 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1600 #endif
1601 return TRUE;
1602 }
1603
1604
1605 /*
1606 * Routine: lck_rw_lock_shared_to_exclusive_failure
1607 * Function:
1608 * assembly fast path code has already dropped our read
1609 * count and determined that someone else owns 'lck_rw_want_upgrade'
1610 * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1611 * all we need to do here is determine if a wakeup is needed
1612 */
1613 static boolean_t
1614 lck_rw_lock_shared_to_exclusive_failure(
1615 lck_rw_t *lck,
1616 uint32_t prior_lock_state)
1617 {
1618 lck_rw_t *fake_lck;
1619 thread_t thread = current_thread();
1620 uint32_t rwlock_count;
1621
1622 /* Check if dropping the lock means that we need to unpromote */
1623 rwlock_count = thread->rwlock_count--;
1624 #if MACH_LDEBUG
1625 if (rwlock_count == 0) {
1626 panic("rw lock count underflow for thread %p", thread);
1627 }
1628 #endif
1629 fake_lck = (lck_rw_t *)&prior_lock_state;
1630
1631 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1632 /*
1633 * Someone else has requested upgrade.
1634 * Since we've released the read lock, wake
1635 * him up if he's blocked waiting
1636 */
1637 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1638 }
1639
1640 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1641 /* sched_flags checked without lock, but will be rechecked while clearing */
1642 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1643 }
1644
1645 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1646 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1647
1648 return (FALSE);
1649 }
1650
1651
1652 /*
1653 * Routine: lck_rw_lock_shared_to_exclusive_failure
1654 * Function:
1655 * assembly fast path code has already dropped our read
1656 * count and successfully acquired 'lck_rw_want_upgrade'
1657 * we just need to wait for the rest of the readers to drain
1658 * and then we can return as the exclusive holder of this lock
1659 */
1660 static boolean_t
1661 lck_rw_lock_shared_to_exclusive_success(
1662 lck_rw_t *lck)
1663 {
1664 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1665 uint64_t deadline = 0;
1666 int slept = 0;
1667 int still_shared = 0;
1668 wait_result_t res;
1669 boolean_t istate = -1;
1670
1671 #if CONFIG_DTRACE
1672 uint64_t wait_interval = 0;
1673 int readers_at_sleep = 0;
1674 boolean_t dtrace_ls_initialized = FALSE;
1675 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1676 #endif
1677
1678 while (lck->lck_rw_shared_count != 0) {
1679
1680 #if CONFIG_DTRACE
1681 if (dtrace_ls_initialized == FALSE) {
1682 dtrace_ls_initialized = TRUE;
1683 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1684 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1685 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1686 if (dtrace_ls_enabled) {
1687 /*
1688 * Either sleeping or spinning is happening,
1689 * start a timing of our delay interval now.
1690 */
1691 readers_at_sleep = lck->lck_rw_shared_count;
1692 wait_interval = mach_absolute_time();
1693 }
1694 }
1695 #endif
1696 if (istate == -1)
1697 istate = ml_get_interrupts_enabled();
1698
1699 deadline = lck_rw_deadline_for_spin(lck);
1700
1701 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1702 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1703
1704 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1705 lck_rw_lock_pause(istate);
1706
1707 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1708 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1709
1710 if ( !still_shared)
1711 break;
1712 /*
1713 * if we get here, the deadline has expired w/o
1714 * the rw_shared_count having drained to 0
1715 * check to see if we're allowed to do a thread_block
1716 */
1717 if (lck->lck_rw_can_sleep) {
1718
1719 istate = lck_interlock_lock(lck);
1720
1721 if (lck->lck_rw_shared_count != 0) {
1722 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1723 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1724
1725 lck->lck_w_waiting = TRUE;
1726
1727 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1728 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1729 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1730 lck_interlock_unlock(lck, istate);
1731
1732 if (res == THREAD_WAITING) {
1733 res = thread_block(THREAD_CONTINUE_NULL);
1734 slept++;
1735 }
1736 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1737 trace_lck, res, slept, 0, 0);
1738 } else {
1739 lck_interlock_unlock(lck, istate);
1740 break;
1741 }
1742 }
1743 }
1744 #if CONFIG_DTRACE
1745 /*
1746 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1747 */
1748 if (dtrace_ls_enabled == TRUE) {
1749 if (slept == 0) {
1750 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1751 } else {
1752 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1753 mach_absolute_time() - wait_interval, 1,
1754 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1755 }
1756 }
1757 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1758 #endif
1759 return (TRUE);
1760 }
1761
1762 /*
1763 * Routine: lck_rw_lock_exclusive_to_shared
1764 */
1765
1766 void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1767 {
1768 uint32_t data, prev;
1769
1770 for ( ; ; ) {
1771 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1772 if (data & LCK_RW_INTERLOCK) {
1773 atomic_exchange_abort();
1774 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1775 continue;
1776 }
1777 data += LCK_RW_SHARED_READER;
1778 if (data & LCK_RW_WANT_UPGRADE)
1779 data &= ~(LCK_RW_WANT_UPGRADE);
1780 else
1781 data &= ~(LCK_RW_WANT_EXCL);
1782 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))
1783 data &= ~(LCK_RW_W_WAITING);
1784 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1785 break;
1786 cpu_pause();
1787 }
1788 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1789 }
1790
1791
1792 /*
1793 * Routine: lck_rw_lock_exclusive_to_shared_gen
1794 * Function:
1795 * assembly fast path has already dropped
1796 * our exclusive state and bumped lck_rw_shared_count
1797 * all we need to do here is determine if anyone
1798 * needs to be awakened.
1799 */
1800 static void
1801 lck_rw_lock_exclusive_to_shared_gen(
1802 lck_rw_t *lck,
1803 uint32_t prior_lock_state)
1804 {
1805 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1806 lck_rw_t *fake_lck;
1807
1808 fake_lck = (lck_rw_t *)&prior_lock_state;
1809
1810 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1811 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1812
1813 /*
1814 * don't wake up anyone waiting to take the lock exclusively
1815 * since we hold a read count... when the read count drops to 0,
1816 * the writers will be woken.
1817 *
1818 * wake up any waiting readers if we don't have any writers waiting,
1819 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1820 */
1821 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1822 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1823
1824 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1825 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1826
1827 #if CONFIG_DTRACE
1828 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1829 #endif
1830 }
1831
1832
1833 /*
1834 * Routine: lck_rw_try_lock
1835 */
1836 boolean_t
1837 lck_rw_try_lock(
1838 lck_rw_t *lck,
1839 lck_rw_type_t lck_rw_type)
1840 {
1841 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1842 return(lck_rw_try_lock_shared(lck));
1843 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1844 return(lck_rw_try_lock_exclusive(lck));
1845 else
1846 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1847 return(FALSE);
1848 }
1849
1850 /*
1851 * Routine: lck_rw_try_lock_shared
1852 */
1853
1854 boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
1855 {
1856 uint32_t data, prev;
1857
1858 for ( ; ; ) {
1859 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1860 if (data & LCK_RW_INTERLOCK) {
1861 atomic_exchange_abort();
1862 lck_rw_interlock_spin(lock);
1863 continue;
1864 }
1865 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1866 atomic_exchange_abort();
1867 return FALSE; /* lock is busy */
1868 }
1869 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1870 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1871 break;
1872 cpu_pause();
1873 }
1874 current_thread()->rwlock_count++;
1875 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1876 #if CONFIG_DTRACE
1877 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1878 #endif /* CONFIG_DTRACE */
1879 return TRUE;
1880 }
1881
1882
1883 /*
1884 * Routine: lck_rw_try_lock_exclusive
1885 */
1886
1887 boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
1888 {
1889 uint32_t data, prev;
1890
1891 for ( ; ; ) {
1892 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1893 if (data & LCK_RW_INTERLOCK) {
1894 atomic_exchange_abort();
1895 lck_rw_interlock_spin(lock);
1896 continue;
1897 }
1898 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1899 atomic_exchange_abort();
1900 return FALSE; /* can't get it */
1901 }
1902 data |= LCK_RW_WANT_EXCL;
1903 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1904 break;
1905 cpu_pause();
1906 }
1907
1908 current_thread()->rwlock_count++;
1909 #if CONFIG_DTRACE
1910 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1911 #endif /* CONFIG_DTRACE */
1912 return TRUE;
1913 }
1914
1915
1916 void
1917 lck_rw_assert(
1918 lck_rw_t *lck,
1919 unsigned int type)
1920 {
1921 switch (type) {
1922 case LCK_RW_ASSERT_SHARED:
1923 if (lck->lck_rw_shared_count != 0) {
1924 return;
1925 }
1926 break;
1927 case LCK_RW_ASSERT_EXCLUSIVE:
1928 if ((lck->lck_rw_want_write ||
1929 lck->lck_rw_want_upgrade) &&
1930 lck->lck_rw_shared_count == 0) {
1931 return;
1932 }
1933 break;
1934 case LCK_RW_ASSERT_HELD:
1935 if (lck->lck_rw_want_write ||
1936 lck->lck_rw_want_upgrade ||
1937 lck->lck_rw_shared_count != 0) {
1938 return;
1939 }
1940 break;
1941 case LCK_RW_ASSERT_NOTHELD:
1942 if (!(lck->lck_rw_want_write ||
1943 lck->lck_rw_want_upgrade ||
1944 lck->lck_rw_shared_count != 0)) {
1945 return;
1946 }
1947 break;
1948 default:
1949 break;
1950 }
1951
1952 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1953 }
1954
1955 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1956 void
1957 lck_rw_clear_promotions_x86(thread_t thread)
1958 {
1959 #if MACH_LDEBUG
1960 /* It's fatal to leave a RW lock locked and return to userspace */
1961 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1962 #else
1963 /* Paper over the issue */
1964 thread->rwlock_count = 0;
1965 lck_rw_clear_promotion(thread, 0);
1966 #endif
1967 }
1968
1969 boolean_t
1970 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
1971 {
1972 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
1973
1974 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
1975 lck_rw_unlock_shared(lck);
1976 mutex_pause(2);
1977 lck_rw_lock_shared(lck);
1978 return TRUE;
1979 }
1980
1981 return FALSE;
1982 }
1983
1984 /*
1985 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1986 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1987 */
1988 boolean_t
1989 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
1990 if (not_in_kdp) {
1991 panic("panic: rw lock exclusive check done outside of kernel debugger");
1992 }
1993 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1994 }
1995
1996 /*
1997 * Slow path routines for lck_mtx locking and unlocking functions.
1998 *
1999 * These functions were previously implemented in x86 assembly,
2000 * and some optimizations are in place in this c code to obtain a compiled code
2001 * as performant and compact as the assembly version.
2002 *
2003 * To avoid inlining these functions into the fast path, all functions directly called by
2004 * the fast paths have __attribute__((noinline)) specified. Also, they are all implemented
2005 * in such a way that the fast path can tail call into them. In this way the return address
2006 * does not need to be pushed on the caller stack and stack optimization can happen on the caller.
2007 *
2008 * Slow path code is structured in such a way that there are no calls to functions that will return
2009 * in the context of the caller function, i.e. all functions called are either tail-call functions
2010 * or inline functions. The number of arguments of the tail-call functions is less than six,
2011 * so that they can be passed in registers and do not need to be pushed on the stack.
2012 * This allows the compiler to not create a stack frame for the functions.
2013 *
2014 * __improbable and __probable are used to compile the slow path code in such a way that
2015 * the fast path case will be a sequence of instructions with as few jumps as possible,
2016 * to make this case the most optimized even if falling through the slow path.
2017 */
2018
2019 /*
2020 * Intel lock invariants:
2021 *
2022 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2023 * lck_mtx_pri: contains the max priority of all waiters during a contention period
2024 * not cleared on last unlock, but stomped over on next first contention
2025 * lck_mtx_promoted: set when the current lock owner has been promoted
2026 * cleared when lock owner unlocks, set on acquire or wait.
2027 *
2028 * The lock owner is promoted to the max priority of all its waiters only if it
2029 * was a lower priority when it acquired or was an owner when a waiter waited.
2030 * Max priority is capped at MAXPRI_PROMOTE.
2031 *
2032 * The last waiter will not be promoted as it is woken up, but the last
2033 * lock owner may not have been the last thread to have been woken up depending on the
2034 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2035 * flag set.
2036 *
2037 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2038 * priority from dropping priority in the future without having to take thread lock
2039 * on acquire.
2040 */
2041
2042 #ifdef MUTEX_ZONE
2043 extern zone_t lck_mtx_zone;
2044 #endif
2045
2046 /*
2047 * N.B.: On x86, statistics are currently recorded for all indirect mutexes.
2048 * Also, only the acquire attempt count (GRP_MTX_STAT_UTIL) is maintained
2049 * as a 64-bit quantity (the new x86 specific statistics are also maintained
2050 * as 32-bit quantities).
2051 *
2052 *
2053 * Enable this preprocessor define to record the first miss alone
2054 * By default, we count every miss, hence multiple misses may be
2055 * recorded for a single lock acquire attempt via lck_mtx_lock
2056 */
2057 #undef LOG_FIRST_MISS_ALONE
2058
2059 /*
2060 * This preprocessor define controls whether the R-M-W update of the
2061 * per-group statistics elements are atomic (LOCK-prefixed)
2062 * Enabled by default.
2063 */
2064 #define ATOMIC_STAT_UPDATES 1
2065
2066
2067 /*
2068 * Routine: lck_mtx_alloc_init
2069 */
2070 lck_mtx_t *
2071 lck_mtx_alloc_init(
2072 lck_grp_t *grp,
2073 lck_attr_t *attr)
2074 {
2075 lck_mtx_t *lck;
2076 #ifdef MUTEX_ZONE
2077 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
2078 lck_mtx_init(lck, grp, attr);
2079 #else
2080 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
2081 lck_mtx_init(lck, grp, attr);
2082 #endif
2083 return(lck);
2084 }
2085
2086 /*
2087 * Routine: lck_mtx_free
2088 */
2089 void
2090 lck_mtx_free(
2091 lck_mtx_t *lck,
2092 lck_grp_t *grp)
2093 {
2094 lck_mtx_destroy(lck, grp);
2095 #ifdef MUTEX_ZONE
2096 zfree(lck_mtx_zone, lck);
2097 #else
2098 kfree(lck, sizeof(lck_mtx_t));
2099 #endif
2100 }
2101
2102 /*
2103 * Routine: lck_mtx_ext_init
2104 */
2105 static void
2106 lck_mtx_ext_init(
2107 lck_mtx_ext_t *lck,
2108 lck_grp_t *grp,
2109 lck_attr_t *attr)
2110 {
2111 bzero((void *)lck, sizeof(lck_mtx_ext_t));
2112
2113 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2114 lck->lck_mtx_deb.type = MUTEX_TAG;
2115 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2116 }
2117
2118 lck->lck_mtx_grp = grp;
2119
2120 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
2121 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2122
2123 lck->lck_mtx.lck_mtx_is_ext = 1;
2124 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2125 }
2126
2127 /*
2128 * Routine: lck_mtx_init
2129 */
2130 void
2131 lck_mtx_init(
2132 lck_mtx_t *lck,
2133 lck_grp_t *grp,
2134 lck_attr_t *attr)
2135 {
2136 lck_mtx_ext_t *lck_ext;
2137 lck_attr_t *lck_attr;
2138
2139 if (attr != LCK_ATTR_NULL)
2140 lck_attr = attr;
2141 else
2142 lck_attr = &LockDefaultLckAttr;
2143
2144 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2145 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2146 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2147 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2148 lck->lck_mtx_ptr = lck_ext;
2149 }
2150 } else {
2151 lck->lck_mtx_owner = 0;
2152 lck->lck_mtx_state = 0;
2153 }
2154 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2155 lck_grp_reference(grp);
2156 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2157 }
2158
2159 /*
2160 * Routine: lck_mtx_init_ext
2161 */
2162 void
2163 lck_mtx_init_ext(
2164 lck_mtx_t *lck,
2165 lck_mtx_ext_t *lck_ext,
2166 lck_grp_t *grp,
2167 lck_attr_t *attr)
2168 {
2169 lck_attr_t *lck_attr;
2170
2171 if (attr != LCK_ATTR_NULL)
2172 lck_attr = attr;
2173 else
2174 lck_attr = &LockDefaultLckAttr;
2175
2176 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2177 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2178 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2179 lck->lck_mtx_ptr = lck_ext;
2180 } else {
2181 lck->lck_mtx_owner = 0;
2182 lck->lck_mtx_state = 0;
2183 }
2184 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2185
2186 lck_grp_reference(grp);
2187 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2188 }
2189
2190 static void
2191 lck_mtx_lock_mark_destroyed(
2192 lck_mtx_t *mutex,
2193 boolean_t indirect)
2194 {
2195 uint32_t state;
2196
2197 if (indirect) {
2198 /* convert to destroyed state */
2199 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2200 return;
2201 }
2202
2203 state = ordered_load_mtx_state(mutex);
2204 lck_mtx_interlock_lock(mutex, &state);
2205
2206 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2207
2208 enable_preemption();
2209 }
2210
2211 /*
2212 * Routine: lck_mtx_destroy
2213 */
2214 void
2215 lck_mtx_destroy(
2216 lck_mtx_t *lck,
2217 lck_grp_t *grp)
2218 {
2219 boolean_t indirect;
2220
2221 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
2222 return;
2223 #if MACH_LDEBUG
2224 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2225 #endif
2226 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2227
2228 lck_mtx_lock_mark_destroyed(lck, indirect);
2229
2230 if (indirect)
2231 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2232 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2233 lck_grp_deallocate(grp);
2234 return;
2235 }
2236
2237
2238 #if DEVELOPMENT | DEBUG
2239 __attribute__((noinline))
2240 void
2241 lck_mtx_owner_check_panic(
2242 lck_mtx_t *lock)
2243 {
2244 thread_t owner = (thread_t)lock->lck_mtx_owner;
2245 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2246 }
2247 #endif
2248
2249 __attribute__((always_inline))
2250 static boolean_t
2251 get_indirect_mutex(
2252 lck_mtx_t **lock,
2253 uint32_t *state)
2254 {
2255 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2256 *state = ordered_load_mtx_state(*lock);
2257 return TRUE;
2258 }
2259
2260 /*
2261 * Routine: lck_mtx_unlock_slow
2262 *
2263 * Unlocks a mutex held by current thread.
2264 *
2265 * It will wake up waiters if necessary and
2266 * drop promotions.
2267 *
2268 * Interlock can be held.
2269 */
2270 __attribute__((noinline))
2271 void
2272 lck_mtx_unlock_slow(
2273 lck_mtx_t *lock)
2274 {
2275 thread_t thread;
2276 uint32_t state, prev;
2277 boolean_t indirect = FALSE;
2278
2279 state = ordered_load_mtx_state(lock);
2280
2281 /* Is this an indirect mutex? */
2282 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2283 indirect = get_indirect_mutex(&lock, &state);
2284 }
2285
2286 thread = current_thread();
2287
2288 #if DEVELOPMENT | DEBUG
2289 thread_t owner = (thread_t)lock->lck_mtx_owner;
2290 if(__improbable(owner != thread))
2291 return lck_mtx_owner_check_panic(lock);
2292 #endif
2293
2294 /* check if it is held as a spinlock */
2295 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0))
2296 goto unlock;
2297
2298 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2299
2300 unlock:
2301 /* preemption disabled, interlock held and mutex not held */
2302
2303 /* clear owner */
2304 ordered_store_mtx_owner(lock, 0);
2305 /* keep original state in prev for later evaluation */
2306 prev = state;
2307 /* release interlock, promotion and clear spin flag */
2308 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK));
2309 if ((state & LCK_MTX_WAITERS_MSK))
2310 state -= LCK_MTX_WAITER; /* decrement waiter count */
2311 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
2312
2313 #if MACH_LDEBUG
2314 /* perform lock statistics after drop to prevent delay */
2315 if (thread)
2316 thread->mutex_count--; /* lock statistic */
2317 #endif /* MACH_LDEBUG */
2318
2319 /* check if there are waiters to wake up or priority to drop */
2320 if ((prev & (LCK_MTX_PROMOTED_MSK | LCK_MTX_WAITERS_MSK)))
2321 return lck_mtx_unlock_wakeup_tail(lock, prev, indirect);
2322
2323 /* re-enable preemption */
2324 lck_mtx_unlock_finish_inline(lock, FALSE);
2325
2326 return;
2327 }
2328
2329 #define LCK_MTX_LCK_WAIT_CODE 0x20
2330 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
2331 #define LCK_MTX_LCK_SPIN_CODE 0x22
2332 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2333 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
2334
2335 /*
2336 * Routine: lck_mtx_unlock_wakeup_tail
2337 *
2338 * Invoked on unlock when there is
2339 * contention, i.e. the assembly routine sees
2340 * that mutex->lck_mtx_waiters != 0 or
2341 * that mutex->lck_mtx_promoted != 0
2342 *
2343 * neither the mutex or interlock is held
2344 *
2345 * Note that this routine might not be called if there are pending
2346 * waiters which have previously been woken up, and they didn't
2347 * end up boosting the old owner.
2348 *
2349 * assembly routine previously did the following to mutex:
2350 * (after saving the state in prior_lock_state)
2351 * cleared lck_mtx_promoted
2352 * decremented lck_mtx_waiters if nonzero
2353 *
2354 * This function needs to be called as a tail call
2355 * to optimize the compiled code.
2356 */
2357 __attribute__((noinline))
2358 static void
2359 lck_mtx_unlock_wakeup_tail (
2360 lck_mtx_t *mutex,
2361 int prior_lock_state,
2362 boolean_t indirect)
2363 {
2364 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2365 lck_mtx_t fake_lck;
2366
2367 /*
2368 * prior_lock state is a snapshot of the 2nd word of the
2369 * lock in question... we'll fake up a lock with the bits
2370 * copied into place and carefully not access anything
2371 * beyond whats defined in the second word of a lck_mtx_t
2372 */
2373 fake_lck.lck_mtx_state = prior_lock_state;
2374
2375 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2376 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
2377
2378 if (__probable(fake_lck.lck_mtx_waiters)) {
2379 kern_return_t did_wake;
2380
2381 if (fake_lck.lck_mtx_waiters > 1)
2382 did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
2383 else
2384 did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex));
2385 /*
2386 * The waiters count always precisely matches the number of threads on the waitqueue.
2387 * i.e. we should never see ret == KERN_NOT_WAITING.
2388 */
2389 assert(did_wake == KERN_SUCCESS);
2390 }
2391
2392 /* When lck_mtx_promoted was set, then I as the owner definitely have a promotion */
2393 if (__improbable(fake_lck.lck_mtx_promoted)) {
2394 thread_t thread = current_thread();
2395
2396 spl_t s = splsched();
2397 thread_lock(thread);
2398
2399 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
2400 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
2401 assert(thread->was_promoted_on_wakeup == 0);
2402 assert(thread->promotions > 0);
2403
2404 assert_promotions_invariant(thread);
2405
2406 if (--thread->promotions == 0)
2407 sched_thread_unpromote(thread, trace_lck);
2408
2409 assert_promotions_invariant(thread);
2410
2411 thread_unlock(thread);
2412 splx(s);
2413 }
2414
2415 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2416 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2417
2418 lck_mtx_unlock_finish_inline(mutex, indirect);
2419 }
2420
2421 /*
2422 * Routine: lck_mtx_lock_acquire_x86
2423 *
2424 * Invoked on acquiring the mutex when there is
2425 * contention (i.e. the assembly routine sees that
2426 * that mutex->lck_mtx_waiters != 0 or
2427 * thread->was_promoted_on_wakeup != 0)...
2428 *
2429 * mutex is owned... interlock is held... preemption is disabled
2430 */
2431 __attribute__((always_inline))
2432 static void
2433 lck_mtx_lock_acquire_inline(
2434 lck_mtx_t *mutex)
2435 {
2436 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2437 integer_t priority;
2438
2439 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2440 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2441
2442 if (mutex->lck_mtx_waiters)
2443 priority = mutex->lck_mtx_pri;
2444 else
2445 priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */
2446
2447 /* the priority must have been set correctly by wait */
2448 assert(priority <= MAXPRI_PROMOTE);
2449 assert(priority == 0 || priority >= BASEPRI_DEFAULT);
2450
2451 /* if the mutex wasn't owned, then the owner wasn't promoted */
2452 assert(mutex->lck_mtx_promoted == 0);
2453
2454 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
2455
2456 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
2457 spl_t s = splsched();
2458 thread_lock(thread);
2459
2460 if (thread->was_promoted_on_wakeup)
2461 assert(thread->promotions > 0);
2462
2463 /* Intel only promotes if priority goes up */
2464 if (thread->sched_pri < priority && thread->promotion_priority < priority) {
2465 /* Remember that I need to drop this promotion on unlock */
2466 mutex->lck_mtx_promoted = 1;
2467
2468 if (thread->promotions++ == 0) {
2469 /* This is the first promotion for the owner */
2470 sched_thread_promote_to_pri(thread, priority, trace_lck);
2471 } else {
2472 /*
2473 * Holder was previously promoted due to a different mutex,
2474 * raise to match this one.
2475 * Or, this thread was promoted on wakeup but someone else
2476 * later contended on mutex at higher priority before we got here
2477 */
2478 sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
2479 }
2480 }
2481
2482 if (thread->was_promoted_on_wakeup) {
2483 thread->was_promoted_on_wakeup = 0;
2484 if (--thread->promotions == 0)
2485 sched_thread_unpromote(thread, trace_lck);
2486 }
2487
2488 thread_unlock(thread);
2489 splx(s);
2490 }
2491 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2492 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2493 }
2494
2495 void
2496 lck_mtx_lock_acquire_x86(
2497 lck_mtx_t *mutex)
2498 {
2499 return lck_mtx_lock_acquire_inline(mutex);
2500 }
2501
2502 /*
2503 * Tail call helpers for lock functions that perform
2504 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2505 * the caller's compiled code.
2506 */
2507
2508 __attribute__((noinline))
2509 static void
2510 lck_mtx_lock_acquire_tail(
2511 lck_mtx_t *mutex,
2512 boolean_t indirect)
2513 {
2514 lck_mtx_lock_acquire_inline(mutex);
2515 lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect);
2516 }
2517
2518 __attribute__((noinline))
2519 static boolean_t
2520 lck_mtx_try_lock_acquire_tail(
2521 lck_mtx_t *mutex)
2522 {
2523 lck_mtx_lock_acquire_inline(mutex);
2524 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2525
2526 return TRUE;
2527 }
2528
2529 __attribute__((noinline))
2530 static void
2531 lck_mtx_convert_spin_acquire_tail(
2532 lck_mtx_t *mutex)
2533 {
2534 lck_mtx_lock_acquire_inline(mutex);
2535 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2536 }
2537
2538 boolean_t
2539 lck_mtx_ilk_unlock(
2540 lck_mtx_t *mutex)
2541 {
2542 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2543 return TRUE;
2544 }
2545
2546 static inline void
2547 lck_mtx_interlock_lock_set_and_clear_flags(
2548 lck_mtx_t *mutex,
2549 uint32_t xor_flags,
2550 uint32_t and_flags,
2551 uint32_t *new_state)
2552 {
2553 uint32_t state, prev;
2554 state = *new_state;
2555
2556 for ( ; ; ) {
2557 /* have to wait for interlock to clear */
2558 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2559 cpu_pause();
2560 state = ordered_load_mtx_state(mutex);
2561 }
2562 prev = state; /* prev contains snapshot for exchange */
2563 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
2564 state &= ~and_flags; /* clear flags */
2565
2566 disable_preemption();
2567 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE))
2568 break;
2569 enable_preemption();
2570 cpu_pause();
2571 state = ordered_load_mtx_state(mutex);
2572 }
2573 *new_state = state;
2574 return;
2575 }
2576
2577 static inline void
2578 lck_mtx_interlock_lock_clear_flags(
2579 lck_mtx_t *mutex,
2580 uint32_t and_flags,
2581 uint32_t *new_state)
2582 {
2583 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2584 }
2585
2586 static inline void
2587 lck_mtx_interlock_lock(
2588 lck_mtx_t *mutex,
2589 uint32_t *new_state)
2590 {
2591 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2592 }
2593
2594 static inline int
2595 lck_mtx_interlock_try_lock_set_flags(
2596 lck_mtx_t *mutex,
2597 uint32_t or_flags,
2598 uint32_t *new_state)
2599 {
2600 uint32_t state, prev;
2601 state = *new_state;
2602
2603 /* have to wait for interlock to clear */
2604 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2605 return 0;
2606 }
2607 prev = state; /* prev contains snapshot for exchange */
2608 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
2609 disable_preemption();
2610 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
2611 *new_state = state;
2612 return 1;
2613 }
2614
2615 enable_preemption();
2616 return 0;
2617 }
2618
2619 static inline int
2620 lck_mtx_interlock_try_lock(
2621 lck_mtx_t *mutex,
2622 uint32_t *new_state)
2623 {
2624 return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state);
2625 }
2626
2627 static inline int
2628 lck_mtx_interlock_try_lock_disable_interrupts(
2629 lck_mtx_t *mutex,
2630 boolean_t *istate)
2631 {
2632 uint32_t state;
2633
2634 *istate = ml_set_interrupts_enabled(FALSE);
2635 state = ordered_load_mtx_state(mutex);
2636
2637 if (lck_mtx_interlock_try_lock(mutex, &state)) {
2638 return 1;
2639 } else {
2640 ml_set_interrupts_enabled(*istate);
2641 return 0;
2642 }
2643 }
2644
2645 static inline void
2646 lck_mtx_interlock_unlock_enable_interrupts(
2647 lck_mtx_t *mutex,
2648 boolean_t istate)
2649 {
2650 lck_mtx_ilk_unlock(mutex);
2651 ml_set_interrupts_enabled(istate);
2652 }
2653
/*
 * Add one to a 64-bit statistics counter.
 *
 * With ATOMIC_STAT_UPDATES enabled the increment is an atomic (relaxed)
 * read-modify-write; otherwise it is a plain increment.
 */
static void __inline__
lck_mtx_inc_stats(
	uint64_t* stat)
{
#if ATOMIC_STAT_UPDATES
	os_atomic_inc(stat, relaxed);
#else
	/*
	 * Plain increment.  Do not write this as `*stat = (*stat)++':
	 * that modifies *stat twice without sequencing, which is
	 * undefined behaviour and may leave the counter unchanged.
	 */
	(*stat)++;
#endif
}
2664
2665 static void __inline__
2666 lck_mtx_update_miss(
2667 struct _lck_mtx_ext_ *lock,
2668 int *first_miss)
2669 {
2670 #if LOG_FIRST_MISS_ALONE
2671 if ((*first_miss & 1) == 0) {
2672 #else
2673 #pragma unused(first_miss)
2674 #endif
2675 uint64_t* stat = &lock->lck_mtx_grp->lck_grp_miss;
2676 lck_mtx_inc_stats(stat);
2677
2678 #if LOG_FIRST_MISS_ALONE
2679 *first_miss |= 1;
2680 }
2681 #endif
2682 }
2683
2684 static void __inline__
2685 lck_mtx_update_direct_wait(
2686 struct _lck_mtx_ext_ *lock)
2687 {
2688 uint64_t* stat = &lock->lck_mtx_grp->lck_grp_direct_wait;
2689 lck_mtx_inc_stats(stat);
2690 }
2691
2692 static void __inline__
2693 lck_mtx_update_wait(
2694 struct _lck_mtx_ext_ *lock,
2695 int *first_miss)
2696 {
2697 #if LOG_FIRST_MISS_ALONE
2698 if ((*first_miss & 2) == 0) {
2699 #else
2700 #pragma unused(first_miss)
2701 #endif
2702 uint64_t* stat = &lock->lck_mtx_grp->lck_grp_wait;
2703 lck_mtx_inc_stats(stat);
2704
2705 #if LOG_FIRST_MISS_ALONE
2706 *first_miss |= 2;
2707 }
2708 #endif
2709 }
2710
2711 static void __inline__
2712 lck_mtx_update_util(
2713 struct _lck_mtx_ext_ *lock)
2714 {
2715 uint64_t* stat = &lock->lck_mtx_grp->lck_grp_util;
2716 lck_mtx_inc_stats(stat);
2717 }
2718
2719 __attribute__((noinline))
2720 static void
2721 lck_mtx_lock_contended(
2722 lck_mtx_t *lock,
2723 boolean_t indirect,
2724 boolean_t *first_miss)
2725 {
2726 lck_mtx_spinwait_ret_type_t ret;
2727 uint32_t state;
2728 thread_t thread;
2729
2730 try_again:
2731
2732 if (indirect) {
2733 lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2734 }
2735
2736 ret = lck_mtx_lock_spinwait_x86(lock);
2737 state = ordered_load_mtx_state(lock);
2738 switch (ret) {
2739 case LCK_MTX_SPINWAIT_NO_SPIN:
2740 /*
2741 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2742 * try to spin.
2743 */
2744 if (indirect) {
2745 lck_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2746 }
2747
2748 /* just fall through case LCK_MTX_SPINWAIT_SPUN */
2749 case LCK_MTX_SPINWAIT_SPUN:
2750 /*
2751 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2752 * interlock not held
2753 */
2754 lck_mtx_interlock_lock(lock, &state);
2755 assert(state & LCK_MTX_ILOCKED_MSK);
2756
2757 if (state & LCK_MTX_MLOCKED_MSK) {
2758 if (indirect) {
2759 lck_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2760 }
2761 lck_mtx_lock_wait_x86(lock);
2762 /*
2763 * interlock is not held here.
2764 */
2765 goto try_again;
2766 } else {
2767
2768 /* grab the mutex */
2769 state |= LCK_MTX_MLOCKED_MSK;
2770 ordered_store_mtx_state_release(lock, state);
2771 thread = current_thread();
2772 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2773 #if MACH_LDEBUG
2774 if (thread) {
2775 thread->mutex_count++;
2776 }
2777 #endif /* MACH_LDEBUG */
2778 }
2779
2780 break;
2781 case LCK_MTX_SPINWAIT_ACQUIRED:
2782 /*
2783 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2784 * interlock is held and preemption disabled
2785 * owner is set and mutex marked as locked
2786 * statistics updated too
2787 */
2788 break;
2789 default:
2790 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2791 }
2792
2793 /*
2794 * interlock is already acquired here
2795 */
2796
2797 /* mutex has been acquired */
2798 thread = (thread_t)lock->lck_mtx_owner;
2799 if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) {
2800 return lck_mtx_lock_acquire_tail(lock, indirect);
2801 }
2802
2803 /* release the interlock */
2804 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2805 }
2806
2807 /*
2808 * Helper noinline functions for calling
2809 * panic to optimize compiled code.
2810 */
2811
2812 __attribute__((noinline))
2813 static void
2814 lck_mtx_destroyed(
2815 lck_mtx_t *lock)
2816 {
2817 panic("trying to interlock destroyed mutex (%p)", lock);
2818 }
2819
2820 __attribute__((noinline))
2821 static boolean_t
2822 lck_mtx_try_destroyed(
2823 lck_mtx_t *lock)
2824 {
2825 panic("trying to interlock destroyed mutex (%p)", lock);
2826 return FALSE;
2827 }
2828
2829 __attribute__((always_inline))
2830 static boolean_t
2831 lck_mtx_lock_wait_interlock_to_clear(
2832 lck_mtx_t *lock,
2833 uint32_t* new_state)
2834 {
2835 uint32_t state;
2836
2837 for ( ; ; ) {
2838 cpu_pause();
2839 state = ordered_load_mtx_state(lock);
2840 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2841 *new_state = state;
2842 return TRUE;
2843 }
2844 if (state & LCK_MTX_MLOCKED_MSK) {
2845 /* if it is held as mutex, just fail */
2846 return FALSE;
2847 }
2848 }
2849 }
2850
2851 __attribute__((always_inline))
2852 static boolean_t
2853 lck_mtx_try_lock_wait_interlock_to_clear(
2854 lck_mtx_t *lock,
2855 uint32_t* new_state)
2856 {
2857 uint32_t state;
2858
2859 for ( ; ; ) {
2860 cpu_pause();
2861 state = ordered_load_mtx_state(lock);
2862 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2863 /* if it is held as mutex or spin, just fail */
2864 return FALSE;
2865 }
2866 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2867 *new_state = state;
2868 return TRUE;
2869 }
2870 }
2871 }
2872
2873 /*
2874 * Routine: lck_mtx_lock_slow
2875 *
2876 * Locks a mutex for current thread.
2877 * If the lock is contended this function might
2878 * sleep.
2879 *
2880 * Called with interlock not held.
2881 */
2882 __attribute__((noinline))
2883 void
2884 lck_mtx_lock_slow(
2885 lck_mtx_t *lock)
2886 {
2887 boolean_t indirect = FALSE;
2888 uint32_t state;
2889 int first_miss = 0;
2890
2891 state = ordered_load_mtx_state(lock);
2892
2893 /* is the interlock or mutex held */
2894 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2895 /*
2896 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2897 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2898 * set in state (state == lck_mtx_tag)
2899 */
2900
2901
2902 /* is the mutex already held and not indirect */
2903 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2904 /* no, must have been the mutex */
2905 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2906 }
2907
2908 /* check to see if it is marked destroyed */
2909 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2910 return lck_mtx_destroyed(lock);
2911 }
2912
2913 /* Is this an indirect mutex? */
2914 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2915 indirect = get_indirect_mutex(&lock, &state);
2916
2917 first_miss = 0;
2918 lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
2919
2920 if (state & LCK_MTX_SPIN_MSK) {
2921 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2922 assert(state & LCK_MTX_ILOCKED_MSK);
2923 lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2924 }
2925 }
2926
2927 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2928 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2929 }
2930 }
2931
2932 /* no - can't be INDIRECT, DESTROYED or locked */
2933 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2934 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2935 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2936 }
2937 }
2938
2939 /* lock and interlock acquired */
2940
2941 thread_t thread = current_thread();
2942 /* record owner of mutex */
2943 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2944
2945 #if MACH_LDEBUG
2946 if (thread) {
2947 thread->mutex_count++; /* lock statistic */
2948 }
2949 #endif
2950 /*
2951 * Check if there are waiters to
2952 * inherit their priority.
2953 */
2954 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2955 return lck_mtx_lock_acquire_tail(lock, indirect);
2956 }
2957
2958 /* release the interlock */
2959 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2960
2961 return;
2962 }
2963
2964 __attribute__((noinline))
2965 boolean_t
2966 lck_mtx_try_lock_slow(
2967 lck_mtx_t *lock)
2968 {
2969 boolean_t indirect = FALSE;
2970 uint32_t state;
2971 int first_miss = 0;
2972
2973 state = ordered_load_mtx_state(lock);
2974
2975 /* is the interlock or mutex held */
2976 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2977 /*
2978 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2979 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2980 * set in state (state == lck_mtx_tag)
2981 */
2982
2983 /* is the mutex already held and not indirect */
2984 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2985 return FALSE;
2986 }
2987
2988 /* check to see if it is marked destroyed */
2989 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2990 return lck_mtx_try_destroyed(lock);
2991 }
2992
2993 /* Is this an indirect mutex? */
2994 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2995 indirect = get_indirect_mutex(&lock, &state);
2996
2997 first_miss = 0;
2998 lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
2999 }
3000
3001 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3002 if (indirect)
3003 lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3004 return FALSE;
3005 }
3006 }
3007
3008 /* no - can't be INDIRECT, DESTROYED or locked */
3009 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3010 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3011 if (indirect)
3012 lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3013 return FALSE;
3014 }
3015 }
3016
3017 /* lock and interlock acquired */
3018
3019 thread_t thread = current_thread();
3020 /* record owner of mutex */
3021 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3022
3023 #if MACH_LDEBUG
3024 if (thread) {
3025 thread->mutex_count++; /* lock statistic */
3026 }
3027 #endif
3028 /*
3029 * Check if there are waiters to
3030 * inherit their priority.
3031 */
3032 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3033 return lck_mtx_try_lock_acquire_tail(lock);
3034 }
3035
3036 /* release the interlock */
3037 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
3038
3039 return TRUE;
3040
3041 }
3042
3043 __attribute__((noinline))
3044 void
3045 lck_mtx_lock_spin_slow(
3046 lck_mtx_t *lock)
3047 {
3048 boolean_t indirect = FALSE;
3049 uint32_t state;
3050 int first_miss = 0;
3051
3052 state = ordered_load_mtx_state(lock);
3053
3054 /* is the interlock or mutex held */
3055 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3056 /*
3057 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3058 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3059 * set in state (state == lck_mtx_tag)
3060 */
3061
3062
3063 /* is the mutex already held and not indirect */
3064 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
3065 /* no, must have been the mutex */
3066 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3067 }
3068
3069 /* check to see if it is marked destroyed */
3070 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3071 return lck_mtx_destroyed(lock);
3072 }
3073
3074 /* Is this an indirect mutex? */
3075 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3076 indirect = get_indirect_mutex(&lock, &state);
3077
3078 first_miss = 0;
3079 lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
3080
3081 if (state & LCK_MTX_SPIN_MSK) {
3082 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
3083 assert(state & LCK_MTX_ILOCKED_MSK);
3084 lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3085 }
3086 }
3087
3088 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3089 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3090 }
3091 }
3092
3093 /* no - can't be INDIRECT, DESTROYED or locked */
3094 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) {
3095 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3096 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3097 }
3098 }
3099
3100 /* lock as spinlock and interlock acquired */
3101
3102 thread_t thread = current_thread();
3103 /* record owner of mutex */
3104 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3105
3106 #if MACH_LDEBUG
3107 if (thread) {
3108 thread->mutex_count++; /* lock statistic */
3109 }
3110 #endif
3111
3112 #if CONFIG_DTRACE
3113 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3114 #endif
3115 /* return with the interlock held and preemption disabled */
3116 return;
3117 }
3118
3119 __attribute__((noinline))
3120 boolean_t
3121 lck_mtx_try_lock_spin_slow(
3122 lck_mtx_t *lock)
3123 {
3124 boolean_t indirect = FALSE;
3125 uint32_t state;
3126 int first_miss = 0;
3127
3128 state = ordered_load_mtx_state(lock);
3129
3130 /* is the interlock or mutex held */
3131 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3132 /*
3133 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3134 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3135 * set in state (state == lck_mtx_tag)
3136 */
3137
3138 /* is the mutex already held and not indirect */
3139 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
3140 return FALSE;
3141 }
3142
3143 /* check to see if it is marked destroyed */
3144 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3145 return lck_mtx_try_destroyed(lock);
3146 }
3147
3148 /* Is this an indirect mutex? */
3149 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3150 indirect = get_indirect_mutex(&lock, &state);
3151
3152 first_miss = 0;
3153 lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
3154 }
3155
3156 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3157 if (indirect)
3158 lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3159 return FALSE;
3160 }
3161 }
3162
3163 /* no - can't be INDIRECT, DESTROYED or locked */
3164 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3165 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3166 if (indirect)
3167 lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3168 return FALSE;
3169 }
3170 }
3171
3172 /* lock and interlock acquired */
3173
3174 thread_t thread = current_thread();
3175 /* record owner of mutex */
3176 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3177
3178 #if MACH_LDEBUG
3179 if (thread) {
3180 thread->mutex_count++; /* lock statistic */
3181 }
3182 #endif
3183
3184 #if CONFIG_DTRACE
3185 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3186 #endif
3187 return TRUE;
3188
3189 }
3190
3191 __attribute__((noinline))
3192 void
3193 lck_mtx_convert_spin(
3194 lck_mtx_t *lock)
3195 {
3196 uint32_t state;
3197
3198 state = ordered_load_mtx_state(lock);
3199
3200 /* Is this an indirect mutex? */
3201 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3202 /* If so, take indirection */
3203 get_indirect_mutex(&lock, &state);
3204 }
3205
3206 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3207
3208 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3209 /* already owned as a mutex, just return */
3210 return;
3211 }
3212
3213 assert(get_preemption_level() > 0);
3214 assert(state & LCK_MTX_ILOCKED_MSK);
3215 assert(state & LCK_MTX_SPIN_MSK);
3216
3217 /*
3218 * Check if there are waiters to
3219 * inherit their priority.
3220 */
3221 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3222 return lck_mtx_convert_spin_acquire_tail(lock);
3223 }
3224
3225 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3226
3227 return;
3228 }
3229
3230 static inline boolean_t
3231 lck_mtx_lock_grab_mutex(
3232 lck_mtx_t *lock)
3233 {
3234 uint32_t state;
3235
3236 state = ordered_load_mtx_state(lock);
3237
3238 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3239 return FALSE;
3240 }
3241
3242 /* lock and interlock acquired */
3243
3244 thread_t thread = current_thread();
3245 /* record owner of mutex */
3246 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3247
3248 #if MACH_LDEBUG
3249 if (thread) {
3250 thread->mutex_count++; /* lock statistic */
3251 }
3252 #endif
3253 return TRUE;
3254 }
3255
3256 __attribute__((noinline))
3257 void
3258 lck_mtx_assert(
3259 lck_mtx_t *lock,
3260 unsigned int type)
3261 {
3262 thread_t thread, owner;
3263 uint32_t state;
3264
3265 thread = current_thread();
3266 state = ordered_load_mtx_state(lock);
3267
3268 if (state == LCK_MTX_TAG_INDIRECT) {
3269 get_indirect_mutex(&lock, &state);
3270 }
3271
3272 owner = (thread_t)lock->lck_mtx_owner;
3273
3274 if (type == LCK_MTX_ASSERT_OWNED) {
3275 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))
3276 panic("mutex (%p) not owned\n", lock);
3277 } else {
3278 assert (type == LCK_MTX_ASSERT_NOTOWNED);
3279 if (owner == thread)
3280 panic("mutex (%p) owned\n", lock);
3281 }
3282 }
3283
3284 /*
3285 * Routine: lck_mtx_lock_spinwait_x86
3286 *
3287 * Invoked trying to acquire a mutex when there is contention but
3288 * the holder is running on another processor. We spin for up to a maximum
3289 * time waiting for the lock to be released.
3290 *
3291 * Called with the interlock unlocked.
3292 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3293 * returns LCK_MTX_SPINWAIT_SPUN if we spun
3294 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
3295 */
3296 __attribute__((noinline))
3297 lck_mtx_spinwait_ret_type_t
3298 lck_mtx_lock_spinwait_x86(
3299 lck_mtx_t *mutex)
3300 {
3301 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3302 thread_t holder;
3303 uint64_t overall_deadline;
3304 uint64_t check_owner_deadline;
3305 uint64_t cur_time;
3306 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN;
3307 int loopcount = 0;
3308
3309 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3310 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3311
3312 cur_time = mach_absolute_time();
3313 overall_deadline = cur_time + MutexSpin;
3314 check_owner_deadline = cur_time;
3315
3316 /*
3317 * Spin while:
3318 * - mutex is locked, and
3319 * - its locked as a spin lock, and
3320 * - owner is running on another processor, and
3321 * - owner (processor) is not idling, and
3322 * - we haven't spun for long enough.
3323 */
3324 do {
3325 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3326 retval = LCK_MTX_SPINWAIT_ACQUIRED;
3327 break;
3328 }
3329 cur_time = mach_absolute_time();
3330
3331 if (cur_time >= overall_deadline)
3332 break;
3333
3334 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
3335 boolean_t istate;
3336
3337 /*
3338 * We will repeatedly peek at the state of the lock while spinning,
3339 * and we will acquire the interlock to do so.
3340 * The thread that will unlock the mutex will also need to acquire
3341 * the interlock, and we want to avoid to slow it down.
3342 * To avoid to get an interrupt while holding the interlock
3343 * and increase the time we are holding it, we
3344 * will try to acquire the interlock with interrupts disabled.
3345 * This is safe because it is a "try_lock", if we can't acquire
3346 * the interlock we re-enable the interrupts and fail, so it is
3347 * ok to call it even if the interlock was already held.
3348 */
3349 if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
3350
3351 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
3352
3353 if ( !(holder->machine.specFlags & OnProc) ||
3354 (holder->state & TH_IDLE)) {
3355
3356 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3357
3358 if (loopcount == 0)
3359 retval = LCK_MTX_SPINWAIT_NO_SPIN;
3360 break;
3361 }
3362 }
3363 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3364
3365 check_owner_deadline = cur_time + (MutexSpin / 4);
3366 }
3367 }
3368 cpu_pause();
3369
3370 loopcount++;
3371
3372 } while (TRUE);
3373
3374 #if CONFIG_DTRACE
3375 /*
3376 * We've already kept a count via overall_deadline of how long we spun.
3377 * If dtrace is active, then we compute backwards to decide how
3378 * long we spun.
3379 *
3380 * Note that we record a different probe id depending on whether
3381 * this is a direct or indirect mutex. This allows us to
3382 * penalize only lock groups that have debug/stats enabled
3383 * with dtrace processing if desired.
3384 */
3385 if (__probable(mutex->lck_mtx_is_ext == 0)) {
3386 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3387 mach_absolute_time() - (overall_deadline - MutexSpin));
3388 } else {
3389 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3390 mach_absolute_time() - (overall_deadline - MutexSpin));
3391 }
3392 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3393 #endif
3394
3395 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3396 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3397
3398 return retval;
3399 }
3400
3401
3402
3403 /*
3404 * Routine: lck_mtx_lock_wait_x86
3405 *
3406 * Invoked in order to wait on contention.
3407 *
3408 * Called with the interlock locked and
3409 * preemption disabled...
3410 * returns it unlocked and with preemption enabled
3411 *
3412 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3413 * A runnable waiter can exist between wait and acquire
3414 * without a waiters count being set.
3415 * This allows us to never make a spurious wakeup call.
3416 *
3417 * Priority:
3418 * This avoids taking the thread lock if the owning thread is the same priority.
3419 * This optimizes the case of same-priority threads contending on a lock.
3420 * However, that allows the owning thread to drop in priority while holding the lock,
3421 * because there is no state that the priority change can notice that
3422 * says that the targeted thread holds a contended mutex.
3423 *
3424 * One possible solution: priority changes could look for some atomic tag
3425 * on the thread saying 'holding contended lock', and then set up a promotion.
3426 * Needs a story for dropping that promotion - the last contended unlock
3427 * has to notice that this has happened.
3428 */
3429 __attribute__((noinline))
3430 void
3431 lck_mtx_lock_wait_x86 (
3432 lck_mtx_t *mutex)
3433 {
3434 #if CONFIG_DTRACE
3435 uint64_t sleep_start = 0;
3436
3437 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3438 sleep_start = mach_absolute_time();
3439 }
3440 #endif
3441 thread_t self = current_thread();
3442 assert(self->waiting_for_mutex == NULL);
3443
3444 self->waiting_for_mutex = mutex;
3445
3446 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3447
3448 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3449 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3450 mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
3451
3452 integer_t waiter_pri = self->sched_pri;
3453 waiter_pri = MAX(waiter_pri, self->base_pri);
3454 waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT);
3455 waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE);
3456
3457 assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE);
3458
3459 /* Re-initialize lck_mtx_pri if this is the first contention */
3460 if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri)
3461 mutex->lck_mtx_pri = waiter_pri;
3462
3463 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3464
3465 assert(holder != NULL);
3466
3467 /*
3468 * Intel only causes a promotion when priority needs to change,
3469 * reducing thread lock holds but leaving us vulnerable to the holder
3470 * dropping priority.
3471 */
3472 if (holder->sched_pri < mutex->lck_mtx_pri) {
3473 int promote_pri = mutex->lck_mtx_pri;
3474
3475 spl_t s = splsched();
3476 thread_lock(holder);
3477
3478 /* Check again in case sched_pri changed */
3479 if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) {
3480 if (mutex->lck_mtx_promoted == 0) {
3481 /* This is the first promotion for this mutex */
3482 mutex->lck_mtx_promoted = 1;
3483
3484 if (holder->promotions++ == 0) {
3485 /* This is the first promotion for holder */
3486 sched_thread_promote_to_pri(holder, promote_pri, trace_lck);
3487 } else {
3488 /*
3489 * Holder was previously promoted due to a different mutex,
3490 * check if it needs to raise to match this one
3491 */
3492 sched_thread_update_promotion_to_pri(holder, promote_pri,
3493 trace_lck);
3494 }
3495 } else {
3496 /*
3497 * Holder was previously promoted due to this mutex,
3498 * check if the pri needs to go up
3499 */
3500 sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck);
3501 }
3502 }
3503
3504 thread_unlock(holder);
3505 splx(s);
3506 }
3507
3508 mutex->lck_mtx_waiters++;
3509
3510 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3511 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
3512
3513 lck_mtx_ilk_unlock(mutex);
3514
3515 thread_block(THREAD_CONTINUE_NULL);
3516
3517 self->waiting_for_mutex = NULL;
3518
3519 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3520 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3521 mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
3522
3523 #if CONFIG_DTRACE
3524 /*
3525 * Record the Dtrace lockstat probe for blocking, block time
3526 * measured from when we were entered.
3527 */
3528 if (sleep_start) {
3529 if (mutex->lck_mtx_is_ext == 0) {
3530 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3531 mach_absolute_time() - sleep_start);
3532 } else {
3533 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3534 mach_absolute_time() - sleep_start);
3535 }
3536 }
3537 #endif
3538 }
3539
3540 /*
3541 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3542 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3543 * Returns: TRUE if lock is acquired.
3544 */
3545 boolean_t
3546 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3547 {
3548 if (not_in_kdp) {
3549 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3550 }
3551
3552 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3553 return TRUE;
3554 }
3555
3556 return FALSE;
3557 }
3558
3559 void
3560 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3561 {
3562 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3563 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3564 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3565 waitinfo->owner = thread_tid(holder);
3566 }
3567
3568 void
3569 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3570 {
3571 lck_rw_t *rwlck = NULL;
3572 switch(waitinfo->wait_type) {
3573 case kThreadWaitKernelRWLockRead:
3574 rwlck = READ_EVENT_TO_RWLOCK(event);
3575 break;
3576 case kThreadWaitKernelRWLockWrite:
3577 case kThreadWaitKernelRWLockUpgrade:
3578 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3579 break;
3580 default:
3581 panic("%s was called with an invalid blocking type", __FUNCTION__);
3582 break;
3583 }
3584 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3585 waitinfo->owner = 0;
3586 }