1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #include <mach_ldebug.h>
65
66 #include <kern/locks.h>
67 #include <kern/kalloc.h>
68 #include <kern/misc_protos.h>
69 #include <kern/thread.h>
70 #include <kern/processor.h>
71 #include <kern/cpu_data.h>
72 #include <kern/cpu_number.h>
73 #include <kern/sched_prim.h>
74 #include <kern/xpr.h>
75 #include <kern/debug.h>
76 #include <string.h>
77
78 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
79 #include <machine/machine_cpu.h>
80 #include <i386/mp.h>
81
82 #include <sys/kdebug.h>
83 #include <mach/branch_predicates.h>
84
85 /*
86 * We need only enough declarations from the BSD-side to be able to
87 * test if our probe is active, and to call __dtrace_probe(). Setting
88 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
89 */
90 #if CONFIG_DTRACE
91 #define NEED_DTRACE_DEFS
92 #include <../bsd/sys/lockstat.h>
93 #endif
94
95 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
96 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
97 #define LCK_RW_LCK_SHARED_CODE 0x102
98 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
99 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
100 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
101
102 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
103 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
104 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
105 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
106 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
107 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
108 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
109 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
110
111
112 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
113
114 unsigned int LcksOpts=0;
115
116 /* Forwards */
117
118 #if USLOCK_DEBUG
119 /*
120 * Perform simple lock checks.
121 */
122 int uslock_check = 1;
123 int max_lock_loops = 100000000;
124 decl_simple_lock_data(extern , printf_lock)
125 decl_simple_lock_data(extern , panic_lock)
126 #endif /* USLOCK_DEBUG */
127
128 extern unsigned int not_in_kdp;
129 extern void kdp_lck_mtx_find_owner(
130 struct waitq * waitq,
131 event64_t event,
132 thread_waitinfo_t * waitinfo);
133
134 extern void kdp_rwlck_find_owner(
135 struct waitq * waitq,
136 event64_t event,
137 thread_waitinfo_t * waitinfo);
138
139 /*
140 * We often want to know the addresses of the callers
141 * of the various lock routines. However, this information
142 * is only used for debugging and statistics.
143 */
144 typedef void *pc_t;
145 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
146 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
147 #if ANY_LOCK_DEBUG
148 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
149 #define DECL_PC(pc) pc_t pc;
150 #else /* ANY_LOCK_DEBUG */
151 #define DECL_PC(pc)
152 #ifdef lint
153 /*
154 * Eliminate lint complaints about unused local pc variables.
155 */
156 #define OBTAIN_PC(pc) ++pc
157 #else /* lint */
158 #define OBTAIN_PC(pc)
159 #endif /* lint */
160 #endif /* ANY_LOCK_DEBUG */
161
162
163 /*
164 * Portable lock package implementation of usimple_locks.
165 */
166
167 #if USLOCK_DEBUG
168 #define USLDBG(stmt) stmt
169 void usld_lock_init(usimple_lock_t, unsigned short);
170 void usld_lock_pre(usimple_lock_t, pc_t);
171 void usld_lock_post(usimple_lock_t, pc_t);
172 void usld_unlock(usimple_lock_t, pc_t);
173 void usld_lock_try_pre(usimple_lock_t, pc_t);
174 void usld_lock_try_post(usimple_lock_t, pc_t);
175 int usld_lock_common_checks(usimple_lock_t, char *);
176 #else /* USLOCK_DEBUG */
177 #define USLDBG(stmt)
178 #endif /* USLOCK_DEBUG */
179
180
181 extern int lck_rw_grab_want(lck_rw_t *lck);
182 extern int lck_rw_grab_shared(lck_rw_t *lck);
183 extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
184
185
186 /*
187 * Forward definitions
188 */
189
190 void lck_rw_lock_shared_gen(
191 lck_rw_t *lck);
192
193 void lck_rw_lock_exclusive_gen(
194 lck_rw_t *lck);
195
196 boolean_t lck_rw_lock_shared_to_exclusive_success(
197 lck_rw_t *lck);
198
199 boolean_t lck_rw_lock_shared_to_exclusive_failure(
200 lck_rw_t *lck,
201 int prior_lock_state);
202
203 void lck_rw_lock_exclusive_to_shared_gen(
204 lck_rw_t *lck,
205 int prior_lock_state);
206
207 lck_rw_type_t lck_rw_done_gen(
208 lck_rw_t *lck,
209 int prior_lock_state);
210
211 void lck_rw_clear_promotions_x86(thread_t thread);
212
213 /*
214 * Routine: lck_spin_alloc_init
215 */
216 lck_spin_t *
217 lck_spin_alloc_init(
218 lck_grp_t *grp,
219 lck_attr_t *attr)
220 {
221 lck_spin_t *lck;
222
223 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
224 lck_spin_init(lck, grp, attr);
225
226 return(lck);
227 }
228
229 /*
230 * Routine: lck_spin_free
231 */
232 void
233 lck_spin_free(
234 lck_spin_t *lck,
235 lck_grp_t *grp)
236 {
237 lck_spin_destroy(lck, grp);
238 kfree(lck, sizeof(lck_spin_t));
239 }
240
241 /*
242 * Routine: lck_spin_init
243 */
244 void
245 lck_spin_init(
246 lck_spin_t *lck,
247 lck_grp_t *grp,
248 __unused lck_attr_t *attr)
249 {
250 usimple_lock_init((usimple_lock_t) lck, 0);
251 lck_grp_reference(grp);
252 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
253 }
254
255 /*
256 * Routine: lck_spin_destroy
257 */
258 void
259 lck_spin_destroy(
260 lck_spin_t *lck,
261 lck_grp_t *grp)
262 {
263 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
264 return;
265 lck->interlock = LCK_SPIN_TAG_DESTROYED;
266 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
267 lck_grp_deallocate(grp);
268 return;
269 }
270
271 /*
272 * Routine: lck_spin_lock
273 */
274 void
275 lck_spin_lock(
276 lck_spin_t *lck)
277 {
278 usimple_lock((usimple_lock_t) lck);
279 }
280
281 /*
282 * Routine: lck_spin_unlock
283 */
284 void
285 lck_spin_unlock(
286 lck_spin_t *lck)
287 {
288 usimple_unlock((usimple_lock_t) lck);
289 }
290
291
292 /*
293 * Routine: lck_spin_try_lock
294 */
295 boolean_t
296 lck_spin_try_lock(
297 lck_spin_t *lck)
298 {
299 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
300 #if DEVELOPMENT || DEBUG
301 if (lrval) {
302 pltrace(FALSE);
303 }
304 #endif
305 return(lrval);
306 }
307
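/*
 * A minimal usage sketch of the spin lock routines above; the group name,
 * lock name, and wrapper function are illustrative only, and
 * lck_grp_alloc_init()/lck_grp_free() are the lock-group interfaces from
 * kern/locks.h.
 */
static void __unused
lck_spin_usage_sketch(void)
{
	lck_grp_t	*grp  = lck_grp_alloc_init("example.spinlock", LCK_GRP_ATTR_NULL);
	lck_spin_t	*lock = lck_spin_alloc_init(grp, LCK_ATTR_NULL);

	lck_spin_lock(lock);			/* spins if contended; returns with preemption disabled */
	/* ... short critical section, no blocking allowed ... */
	lck_spin_unlock(lock);

	if (lck_spin_try_lock(lock)) {		/* non-blocking attempt */
		lck_spin_unlock(lock);
	}

	lck_spin_free(lock, grp);		/* destroys the lock and drops its group reference */
	lck_grp_free(grp);
}
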
308 /*
309 * Routine: lck_spin_assert
310 */
311 void
312 lck_spin_assert(lck_spin_t *lock, unsigned int type)
313 {
314 thread_t thread, holder;
315 uintptr_t state;
316
317 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
318 panic("lck_spin_assert(): invalid arg (%u)", type);
319 }
320
321 state = lock->interlock;
322 holder = (thread_t)state;
323 thread = current_thread();
324 if (type == LCK_ASSERT_OWNED) {
325 if (__improbable(holder == THREAD_NULL)) {
326 panic("Lock not owned %p = %lx", lock, state);
327 }
328 if (__improbable(holder != thread)) {
329 panic("Lock not owned by current thread %p = %lx", lock, state);
330 }
331 } else if (type == LCK_ASSERT_NOTOWNED) {
332 if (__improbable(holder != THREAD_NULL)) {
333 if (holder == thread) {
334 panic("Lock owned by current thread %p = %lx", lock, state);
335 } else {
336 panic("Lock %p owned by thread %p", lock, holder);
337 }
338 }
339 }
340 }
341
342 /*
343 * Routine: kdp_lck_spin_is_acquired
344 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
345 * Returns: TRUE if lock is acquired.
346 */
347 boolean_t
348 kdp_lck_spin_is_acquired(lck_spin_t *lck) {
349 if (not_in_kdp) {
350 panic("panic: spinlock acquired check done outside of kernel debugger");
351 }
352 return (lck->interlock != 0)? TRUE : FALSE;
353 }
354
355 /*
356 * Initialize a usimple_lock.
357 *
358 * No change in preemption state.
359 */
360 void
361 usimple_lock_init(
362 usimple_lock_t l,
363 __unused unsigned short tag)
364 {
365 #ifndef MACHINE_SIMPLE_LOCK
366 USLDBG(usld_lock_init(l, tag));
367 hw_lock_init(&l->interlock);
368 #else
369 simple_lock_init((simple_lock_t)l,tag);
370 #endif
371 }
372
373 volatile uint32_t spinlock_owner_cpu = ~0;
374 volatile usimple_lock_t spinlock_timed_out;
375
376 uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
377 uint64_t deadline;
378 uint32_t i;
379
380 for (i = 0; i < real_ncpus; i++) {
381 if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
382 spinlock_owner_cpu = i;
383 if ((uint32_t) cpu_number() == i)
384 break;
385 cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
386 cpu_NMI_interrupt(i);
387 deadline = mach_absolute_time() + (LockTimeOut * 2);
388 while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
389 cpu_pause();
390 break;
391 }
392 }
393
394 return spinlock_owner_cpu;
395 }
396
397 /*
398 * Acquire a usimple_lock.
399 *
400 * Returns with preemption disabled. Note
401 * that the hw_lock routines are responsible for
402 * maintaining preemption state.
403 */
404 void
405 usimple_lock(
406 usimple_lock_t l)
407 {
408 #ifndef MACHINE_SIMPLE_LOCK
409 DECL_PC(pc);
410
411 OBTAIN_PC(pc);
412 USLDBG(usld_lock_pre(l, pc));
413
414 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
415 boolean_t uslock_acquired = FALSE;
416 while (machine_timeout_suspended()) {
417 enable_preemption();
418 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
419 break;
420 }
421
422 if (uslock_acquired == FALSE) {
423 uint32_t lock_cpu;
424 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
425 spinlock_timed_out = l;
426 lock_cpu = spinlock_timeout_NMI(lowner);
427 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
428 }
429 }
430 #if DEVELOPMENT || DEBUG
431 pltrace(FALSE);
432 #endif
433
434 USLDBG(usld_lock_post(l, pc));
435 #else
436 simple_lock((simple_lock_t)l);
437 #endif
438 }
439
440
441 /*
442 * Release a usimple_lock.
443 *
444 * Returns with preemption enabled. Note
445 * that the hw_lock routines are responsible for
446 * maintaining preemption state.
447 */
448 void
449 usimple_unlock(
450 usimple_lock_t l)
451 {
452 #ifndef MACHINE_SIMPLE_LOCK
453 DECL_PC(pc);
454
455 OBTAIN_PC(pc);
456 USLDBG(usld_unlock(l, pc));
457 #if DEVELOPMENT || DEBUG
458 pltrace(TRUE);
459 #endif
460 hw_lock_unlock(&l->interlock);
461 #else
462 simple_unlock_rwmb((simple_lock_t)l);
463 #endif
464 }
465
466
467 /*
468 * Conditionally acquire a usimple_lock.
469 *
470 * On success, returns with preemption disabled.
471 * On failure, returns with preemption in the same state
472 * as when first invoked. Note that the hw_lock routines
473 * are responsible for maintaining preemption state.
474 *
475 * XXX No stats are gathered on a miss; I preserved this
476 * behavior from the original assembly-language code, but
477 * doesn't it make sense to log misses? XXX
478 */
479 unsigned int
480 usimple_lock_try(
481 usimple_lock_t l)
482 {
483 #ifndef MACHINE_SIMPLE_LOCK
484 unsigned int success;
485 DECL_PC(pc);
486
487 OBTAIN_PC(pc);
488 USLDBG(usld_lock_try_pre(l, pc));
489 if ((success = hw_lock_try(&l->interlock))) {
490 #if DEVELOPMENT || DEBUG
491 pltrace(FALSE);
492 #endif
493 USLDBG(usld_lock_try_post(l, pc));
494 }
495 return success;
496 #else
497 return(simple_lock_try((simple_lock_t)l));
498 #endif
499 }
500
501 /*
502 * Acquire a usimple_lock while polling for pending TLB flushes
503 * and spinning on a lock.
504 *
505 */
506 void
507 usimple_lock_try_lock_loop(usimple_lock_t l)
508 {
509 boolean_t istate = ml_get_interrupts_enabled();
510 while (!simple_lock_try((l))) {
511 if (!istate)
512 handle_pending_TLB_flushes();
513 cpu_pause();
514 }
515 }
516
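/*
 * A minimal sketch of how the usimple_lock interface above is typically
 * used; the wrapper function and its parameter are illustrative only.
 */
static void __unused
usimple_lock_usage_sketch(usimple_lock_t l)
{
	usimple_lock_init(l, 0);

	usimple_lock(l);			/* spins; returns with preemption disabled */
	/* ... short critical section ... */
	usimple_unlock(l);			/* preemption re-enabled */

	if (usimple_lock_try(l)) {		/* preemption stays disabled only on success */
		usimple_unlock(l);
	}
}
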
517 #if USLOCK_DEBUG
518 /*
519 * States of a usimple_lock. The default when initializing
520 * a usimple_lock is setting it up for debug checking.
521 */
522 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
523 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
524 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
525 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
526 #define USLOCK_CHECKING(l) (uslock_check && \
527 ((l)->debug.state & USLOCK_CHECKED))
528
529 /*
530 * Trace activities of a particularly interesting lock.
531 */
532 void usl_trace(usimple_lock_t, int, pc_t, const char *);
533
534
535 /*
536 * Initialize the debugging information contained
537 * in a usimple_lock.
538 */
539 void
540 usld_lock_init(
541 usimple_lock_t l,
542 __unused unsigned short tag)
543 {
544 if (l == USIMPLE_LOCK_NULL)
545 panic("lock initialization: null lock pointer");
546 l->lock_type = USLOCK_TAG;
547 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
548 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
549 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
550 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
551 l->debug.duration[0] = l->debug.duration[1] = 0;
552 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
553 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
554 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
555 }
556
557
558 /*
559 * These checks apply to all usimple_locks, not just
560 * those with USLOCK_CHECKED turned on.
561 */
562 int
563 usld_lock_common_checks(
564 usimple_lock_t l,
565 char *caller)
566 {
567 if (l == USIMPLE_LOCK_NULL)
568 panic("%s: null lock pointer", caller);
569 if (l->lock_type != USLOCK_TAG)
570 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
571 if (!(l->debug.state & USLOCK_INIT))
572 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
573 return USLOCK_CHECKING(l);
574 }
575
576
577 /*
578 * Debug checks on a usimple_lock just before attempting
579 * to acquire it.
580 */
581 /* ARGSUSED */
582 void
583 usld_lock_pre(
584 usimple_lock_t l,
585 pc_t pc)
586 {
587 char caller[] = "usimple_lock";
588
589
590 if (!usld_lock_common_checks(l, caller))
591 return;
592
593 /*
594 * Note that we have a weird case where we are getting a lock when we are
595 * in the process of putting the system to sleep. We are running with no
596 * current thread, so we can't tell whether we are trying to retake a lock
597 * we already hold or whether another processor holds it. We therefore
598 * just ignore this test if the locking thread is 0.
599 */
600
601 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
602 l->debug.lock_thread == (void *) current_thread()) {
603 printf("%s: lock %p already locked (at %p) by",
604 caller, l, l->debug.lock_pc);
605 printf(" current thread %p (new attempt at pc %p)\n",
606 l->debug.lock_thread, pc);
607 panic("%s", caller);
608 }
609 mp_disable_preemption();
610 usl_trace(l, cpu_number(), pc, caller);
611 mp_enable_preemption();
612 }
613
614
615 /*
616 * Debug checks on a usimple_lock just after acquiring it.
617 *
618 * Pre-emption has been disabled at this point,
619 * so we are safe in using cpu_number.
620 */
621 void
622 usld_lock_post(
623 usimple_lock_t l,
624 pc_t pc)
625 {
626 int mycpu;
627 char caller[] = "successful usimple_lock";
628
629
630 if (!usld_lock_common_checks(l, caller))
631 return;
632
633 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
634 panic("%s: lock %p became uninitialized",
635 caller, l);
636 if ((l->debug.state & USLOCK_TAKEN))
637 panic("%s: lock 0x%p became TAKEN by someone else",
638 caller, l);
639
640 mycpu = cpu_number();
641 l->debug.lock_thread = (void *)current_thread();
642 l->debug.state |= USLOCK_TAKEN;
643 l->debug.lock_pc = pc;
644 l->debug.lock_cpu = mycpu;
645
646 usl_trace(l, mycpu, pc, caller);
647 }
648
649
650 /*
651 * Debug checks on a usimple_lock just before
652 * releasing it. Note that the caller has not
653 * yet released the hardware lock.
654 *
655 * Preemption is still disabled, so there's
656 * no problem using cpu_number.
657 */
658 void
659 usld_unlock(
660 usimple_lock_t l,
661 pc_t pc)
662 {
663 int mycpu;
664 char caller[] = "usimple_unlock";
665
666
667 if (!usld_lock_common_checks(l, caller))
668 return;
669
670 mycpu = cpu_number();
671
672 if (!(l->debug.state & USLOCK_TAKEN))
673 panic("%s: lock 0x%p hasn't been taken",
674 caller, l);
675 if (l->debug.lock_thread != (void *) current_thread())
676 panic("%s: unlocking lock 0x%p, owned by thread %p",
677 caller, l, l->debug.lock_thread);
678 if (l->debug.lock_cpu != mycpu) {
679 printf("%s: unlocking lock 0x%p on cpu 0x%x",
680 caller, l, mycpu);
681 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
682 panic("%s", caller);
683 }
684 usl_trace(l, mycpu, pc, caller);
685
686 l->debug.unlock_thread = l->debug.lock_thread;
687 l->debug.lock_thread = INVALID_THREAD;
688 l->debug.state &= ~USLOCK_TAKEN;
689 l->debug.unlock_pc = pc;
690 l->debug.unlock_cpu = mycpu;
691 }
692
693
694 /*
695 * Debug checks on a usimple_lock just before
696 * attempting to acquire it.
697 *
698 * Preemption isn't guaranteed to be disabled.
699 */
700 void
701 usld_lock_try_pre(
702 usimple_lock_t l,
703 pc_t pc)
704 {
705 char caller[] = "usimple_lock_try";
706
707 if (!usld_lock_common_checks(l, caller))
708 return;
709 mp_disable_preemption();
710 usl_trace(l, cpu_number(), pc, caller);
711 mp_enable_preemption();
712 }
713
714
715 /*
716 * Debug checks on a usimple_lock just after
717 * successfully attempting to acquire it.
718 *
719 * Preemption has been disabled by the
720 * lock acquisition attempt, so it's safe
721 * to use cpu_number.
722 */
723 void
724 usld_lock_try_post(
725 usimple_lock_t l,
726 pc_t pc)
727 {
728 int mycpu;
729 char caller[] = "successful usimple_lock_try";
730
731 if (!usld_lock_common_checks(l, caller))
732 return;
733
734 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
735 panic("%s: lock 0x%p became uninitialized",
736 caller, l);
737 if ((l->debug.state & USLOCK_TAKEN))
738 panic("%s: lock 0x%p became TAKEN by someone else",
739 caller, l);
740
741 mycpu = cpu_number();
742 l->debug.lock_thread = (void *) current_thread();
743 l->debug.state |= USLOCK_TAKEN;
744 l->debug.lock_pc = pc;
745 l->debug.lock_cpu = mycpu;
746
747 usl_trace(l, mycpu, pc, caller);
748 }
749
750
751 /*
752 * For very special cases, set traced_lock to point to a
753 * specific lock of interest. The result is a series of
754 * XPRs showing lock operations on that lock. The lock_seq
755 * value is used to show the order of those operations.
756 */
757 usimple_lock_t traced_lock;
758 unsigned int lock_seq;
759
760 void
761 usl_trace(
762 usimple_lock_t l,
763 int mycpu,
764 pc_t pc,
765 const char * op_name)
766 {
767 if (traced_lock == l) {
768 XPR(XPR_SLOCK,
769 "seq %d, cpu %d, %s @ %x\n",
770 (uintptr_t) lock_seq, (uintptr_t) mycpu,
771 (uintptr_t) op_name, (uintptr_t) pc, 0);
772 lock_seq++;
773 }
774 }
775
776
777 #endif /* USLOCK_DEBUG */
778
779 /*
780 * Routine: lck_rw_alloc_init
781 */
782 lck_rw_t *
783 lck_rw_alloc_init(
784 lck_grp_t *grp,
785 lck_attr_t *attr) {
786 lck_rw_t *lck;
787
788 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
789 bzero(lck, sizeof(lck_rw_t));
790 lck_rw_init(lck, grp, attr);
791 }
792
793 return(lck);
794 }
795
796 /*
797 * Routine: lck_rw_free
798 */
799 void
800 lck_rw_free(
801 lck_rw_t *lck,
802 lck_grp_t *grp) {
803 lck_rw_destroy(lck, grp);
804 kfree(lck, sizeof(lck_rw_t));
805 }
806
807 /*
808 * Routine: lck_rw_init
809 */
810 void
811 lck_rw_init(
812 lck_rw_t *lck,
813 lck_grp_t *grp,
814 lck_attr_t *attr)
815 {
816 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
817 attr : &LockDefaultLckAttr;
818
819 hw_lock_byte_init(&lck->lck_rw_interlock);
820 lck->lck_rw_want_write = FALSE;
821 lck->lck_rw_want_upgrade = FALSE;
822 lck->lck_rw_shared_count = 0;
823 lck->lck_rw_can_sleep = TRUE;
824 lck->lck_r_waiting = lck->lck_w_waiting = 0;
825 lck->lck_rw_tag = 0;
826 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
827 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
828
829 lck_grp_reference(grp);
830 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
831 }
832
833 /*
834 * Routine: lck_rw_destroy
835 */
836 void
837 lck_rw_destroy(
838 lck_rw_t *lck,
839 lck_grp_t *grp)
840 {
841 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
842 return;
843 #if MACH_LDEBUG
844 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
845 #endif
846 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
847 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
848 lck_grp_deallocate(grp);
849 return;
850 }
851
852 /*
853 * Sleep locks. These use the same data structure and algorithm
854 * as the spin locks, but the process sleeps while it is waiting
855 * for the lock. These work on uniprocessor systems.
856 */
857
858 #define DECREMENTER_TIMEOUT 1000000
859
860 /*
861 * We disable interrupts while holding the RW interlock to prevent an
862 * interrupt from exacerbating hold time.
863 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
864 */
865 static boolean_t
866 lck_interlock_lock(lck_rw_t *lck)
867 {
868 boolean_t istate;
869
870 istate = ml_set_interrupts_enabled(FALSE);
871 hw_lock_byte_lock(&lck->lck_rw_interlock);
872
873 return istate;
874 }
875
876 static void
877 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
878 {
879 hw_lock_byte_unlock(&lck->lck_rw_interlock);
880 ml_set_interrupts_enabled(istate);
881 }
882
883 /*
884 * This inline is used when busy-waiting for an rw lock.
885 * If interrupts were disabled when the lock primitive was called,
886 * we poll the IPI handler for pending tlb flushes.
887 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
888 */
889 static inline void
890 lck_rw_lock_pause(boolean_t interrupts_enabled)
891 {
892 if (!interrupts_enabled)
893 handle_pending_TLB_flushes();
894 cpu_pause();
895 }
896
897
898 /*
899 * compute the deadline to spin against when
900 * waiting for a change of state on a lck_rw_t
901 */
902 static inline uint64_t
903 lck_rw_deadline_for_spin(lck_rw_t *lck)
904 {
905 if (lck->lck_rw_can_sleep) {
906 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
907 /*
908 * there are already threads waiting on this lock... this
909 * implies that they have spun beyond their deadlines waiting for
910 * the desired state to show up so we will not bother spinning at this time...
911 * or
912 * the current number of threads sharing this lock exceeds our capacity to run them
913 * concurrently and since all states we're going to spin for require the rw_shared_count
914 * to be at 0, we'll not bother spinning since the latency for this to happen is
915 * unpredictable...
916 */
917 return (mach_absolute_time());
918 }
919 return (mach_absolute_time() + MutexSpin);
920 } else
921 return (mach_absolute_time() + (100000LL * 1000000000LL));
922 }
923
924
925 /*
926 * Routine: lck_rw_lock_exclusive
927 */
928 void
929 lck_rw_lock_exclusive_gen(
930 lck_rw_t *lck)
931 {
932 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
933 uint64_t deadline = 0;
934 int slept = 0;
935 int gotlock = 0;
936 int lockheld = 0;
937 wait_result_t res = 0;
938 boolean_t istate = -1;
939
940 #if CONFIG_DTRACE
941 boolean_t dtrace_ls_initialized = FALSE;
942 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
943 uint64_t wait_interval = 0;
944 int readers_at_sleep = 0;
945 #endif
946
947 /*
948 * Try to acquire the lck_rw_want_write bit.
949 */
950 while ( !lck_rw_grab_want(lck)) {
951
952 #if CONFIG_DTRACE
953 if (dtrace_ls_initialized == FALSE) {
954 dtrace_ls_initialized = TRUE;
955 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
956 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
957 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
958 if (dtrace_ls_enabled) {
959 /*
960 * Either sleeping or spinning is happening,
961 * start a timing of our delay interval now.
962 */
963 readers_at_sleep = lck->lck_rw_shared_count;
964 wait_interval = mach_absolute_time();
965 }
966 }
967 #endif
968 if (istate == -1)
969 istate = ml_get_interrupts_enabled();
970
971 deadline = lck_rw_deadline_for_spin(lck);
972
973 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
974
975 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
976 lck_rw_lock_pause(istate);
977
978 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
979
980 if (gotlock)
981 break;
982 /*
983 * if we get here, the deadline has expired w/o us
984 * being able to grab the lock exclusively
985 * check to see if we're allowed to do a thread_block
986 */
987 if (lck->lck_rw_can_sleep) {
988
989 istate = lck_interlock_lock(lck);
990
991 if (lck->lck_rw_want_write) {
992
993 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
994
995 lck->lck_w_waiting = TRUE;
996
997 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
998 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
999 lck_interlock_unlock(lck, istate);
1000
1001 if (res == THREAD_WAITING) {
1002 res = thread_block(THREAD_CONTINUE_NULL);
1003 slept++;
1004 }
1005 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1006 } else {
1007 lck->lck_rw_want_write = TRUE;
1008 lck_interlock_unlock(lck, istate);
1009 break;
1010 }
1011 }
1012 }
1013 /*
1014 * Wait for readers (and upgrades) to finish...
1015 * the test for these conditions must be done simultaneously with
1016 * a check of the interlock not being held since
1017 * the rw_shared_count will drop to 0 first and then want_upgrade
1018 * will be set to 1 in the shared_to_exclusive scenario... those
1019 * adjustments are done behind the interlock and represent an
1020 * atomic change in state and must be considered as such
1021 * however, once we see the read count at 0, the want_upgrade not set
1022 * and the interlock not held, we are safe to proceed
1023 */
1024 while (lck_rw_held_read_or_upgrade(lck)) {
1025
1026 #if CONFIG_DTRACE
1027 /*
1028 * Either sleeping or spinning is happening, start
1029 * a timing of our delay interval now. If we set it
1030 * to -1 we don't have accurate data so we cannot later
1031 * decide to record a dtrace spin or sleep event.
1032 */
1033 if (dtrace_ls_initialized == FALSE) {
1034 dtrace_ls_initialized = TRUE;
1035 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1036 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1037 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1038 if (dtrace_ls_enabled) {
1039 /*
1040 * Either sleeping or spinning is happening,
1041 * start a timing of our delay interval now.
1042 */
1043 readers_at_sleep = lck->lck_rw_shared_count;
1044 wait_interval = mach_absolute_time();
1045 }
1046 }
1047 #endif
1048 if (istate == -1)
1049 istate = ml_get_interrupts_enabled();
1050
1051 deadline = lck_rw_deadline_for_spin(lck);
1052
1053 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1054
1055 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1056 lck_rw_lock_pause(istate);
1057
1058 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1059
1060 if ( !lockheld)
1061 break;
1062 /*
1063 * if we get here, the deadline has expired w/o us
1064 * being able to grab the lock exclusively
1065 * check to see if we're allowed to do a thread_block
1066 */
1067 if (lck->lck_rw_can_sleep) {
1068
1069 istate = lck_interlock_lock(lck);
1070
1071 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1072 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1073
1074 lck->lck_w_waiting = TRUE;
1075
1076 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1077 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1078 lck_interlock_unlock(lck, istate);
1079
1080 if (res == THREAD_WAITING) {
1081 res = thread_block(THREAD_CONTINUE_NULL);
1082 slept++;
1083 }
1084 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1085 } else {
1086 lck_interlock_unlock(lck, istate);
1087 /*
1088 * must own the lock now, since we checked for
1089 * readers or upgrade owner behind the interlock
1090 * no need for a call to 'lck_rw_held_read_or_upgrade'
1091 */
1092 break;
1093 }
1094 }
1095 }
1096
1097 #if CONFIG_DTRACE
1098 /*
1099 * Decide what latencies we suffered that are Dtrace events.
1100 * If we have set wait_interval, then we either spun or slept.
1101 * At least we get out from under the interlock before we record
1102 * which is the best we can do here to minimize the impact
1103 * of the tracing.
1104 * If we have set wait_interval to -1, then dtrace was not enabled when we
1105 * started sleeping/spinning so we don't record this event.
1106 */
1107 if (dtrace_ls_enabled == TRUE) {
1108 if (slept == 0) {
1109 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1110 mach_absolute_time() - wait_interval, 1);
1111 } else {
1112 /*
1113 * For the blocking case, we also record whether the lock was held
1114 * for read or write when we blocked, and how many readers there were.
1115 * Notice that above we recorded this before we dropped
1116 * the interlock so the count is accurate.
1117 */
1118 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1119 mach_absolute_time() - wait_interval, 1,
1120 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1121 }
1122 }
1123 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1124 #endif
1125 }
1126
1127
1128 /*
1129 * Routine: lck_rw_done_gen
1130 *
1131 * called from the assembly language wrapper...
1132 * prior_lock_state is the value in the 1st
1133 * word of the lock at the time of a successful
1134 * atomic compare and exchange with the new value...
1135 * it represents the state of the lock before we
1136 * decremented the rw_shared_count or cleared either
1137 * rw_want_upgrade or rw_want_write and
1138 * the lck_x_waiting bits... since the wrapper
1139 * routine has already changed the state atomically,
1140 * we just need to decide if we should
1141 * wake up anyone and what value to return... we do
1142 * this by examining the state of the lock before
1143 * we changed it
1144 */
1145 lck_rw_type_t
1146 lck_rw_done_gen(
1147 lck_rw_t *lck,
1148 int prior_lock_state)
1149 {
1150 lck_rw_t *fake_lck;
1151 lck_rw_type_t lock_type;
1152 thread_t thread;
1153 uint32_t rwlock_count;
1154
1155 /*
1156 * prior_lock state is a snapshot of the 1st word of the
1157 * lock in question... we'll fake up a pointer to it
1158 * and carefully not access anything beyond what's defined
1159 * in the first word of a lck_rw_t
1160 */
1161 fake_lck = (lck_rw_t *)&prior_lock_state;
1162
1163 if (fake_lck->lck_rw_shared_count <= 1) {
1164 if (fake_lck->lck_w_waiting)
1165 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1166
1167 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1168 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1169 }
1170 if (fake_lck->lck_rw_shared_count)
1171 lock_type = LCK_RW_TYPE_SHARED;
1172 else
1173 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1174
1175 /* Check if dropping the lock means that we need to unpromote */
1176 thread = current_thread();
1177 rwlock_count = thread->rwlock_count--;
1178 #if MACH_LDEBUG
1179 if (rwlock_count == 0) {
1180 panic("rw lock count underflow for thread %p", thread);
1181 }
1182 #endif
1183 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1184 /* sched_flags checked without lock, but will be rechecked while clearing */
1185 lck_rw_clear_promotion(thread);
1186 }
1187
1188 #if CONFIG_DTRACE
1189 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1190 #endif
1191
1192 return(lock_type);
1193 }
1194
1195
1196 /*
1197 * Routine: lck_rw_unlock
1198 */
1199 void
1200 lck_rw_unlock(
1201 lck_rw_t *lck,
1202 lck_rw_type_t lck_rw_type)
1203 {
1204 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1205 lck_rw_unlock_shared(lck);
1206 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1207 lck_rw_unlock_exclusive(lck);
1208 else
1209 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1210 }
1211
1212
1213 /*
1214 * Routine: lck_rw_unlock_shared
1215 */
1216 void
1217 lck_rw_unlock_shared(
1218 lck_rw_t *lck)
1219 {
1220 lck_rw_type_t ret;
1221
1222 ret = lck_rw_done(lck);
1223
1224 if (ret != LCK_RW_TYPE_SHARED)
1225 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1226 }
1227
1228
1229 /*
1230 * Routine: lck_rw_unlock_exclusive
1231 */
1232 void
1233 lck_rw_unlock_exclusive(
1234 lck_rw_t *lck)
1235 {
1236 lck_rw_type_t ret;
1237
1238 ret = lck_rw_done(lck);
1239
1240 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1241 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1242 }
1243
1244
1245 /*
1246 * Routine: lck_rw_lock
1247 */
1248 void
1249 lck_rw_lock(
1250 lck_rw_t *lck,
1251 lck_rw_type_t lck_rw_type)
1252 {
1253 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1254 lck_rw_lock_shared(lck);
1255 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1256 lck_rw_lock_exclusive(lck);
1257 else
1258 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1259 }
1260
1261
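/*
 * A minimal usage sketch of the read-write lock interface whose slow paths
 * are implemented in this file; the group name, lock name, and wrapper
 * function are illustrative only, and lck_rw_lock_shared()/lck_rw_done()
 * are the standard entry points declared in kern/locks.h.
 */
static void __unused
lck_rw_usage_sketch(void)
{
	lck_grp_t	*grp = lck_grp_alloc_init("example.rwlock", LCK_GRP_ATTR_NULL);
	lck_rw_t	*lck = lck_rw_alloc_init(grp, LCK_ATTR_NULL);

	lck_rw_lock_shared(lck);			/* multiple readers may hold this concurrently */
	/* ... read-only access to the protected data ... */
	lck_rw_unlock_shared(lck);

	lck_rw_lock(lck, LCK_RW_TYPE_EXCLUSIVE);	/* typed form of lck_rw_lock_exclusive() */
	/* ... modify the protected data ... */
	(void) lck_rw_done(lck);			/* returns the mode that was just dropped */

	lck_rw_free(lck, grp);
	lck_grp_free(grp);
}
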
1262 /*
1263 * Routine: lck_rw_lock_shared_gen
1264 * Function:
1265 * assembly fast path code has determined that this lock
1266 * is held exclusively... this is where we spin/block
1267 * until we can acquire the lock in the shared mode
1268 */
1269 void
1270 lck_rw_lock_shared_gen(
1271 lck_rw_t *lck)
1272 {
1273 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1274 uint64_t deadline = 0;
1275 int gotlock = 0;
1276 int slept = 0;
1277 wait_result_t res = 0;
1278 boolean_t istate = -1;
1279
1280 #if CONFIG_DTRACE
1281 uint64_t wait_interval = 0;
1282 int readers_at_sleep = 0;
1283 boolean_t dtrace_ls_initialized = FALSE;
1284 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1285 #endif
1286
1287 while ( !lck_rw_grab_shared(lck)) {
1288
1289 #if CONFIG_DTRACE
1290 if (dtrace_ls_initialized == FALSE) {
1291 dtrace_ls_initialized = TRUE;
1292 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1293 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1294 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1295 if (dtrace_ls_enabled) {
1296 /*
1297 * Either sleeping or spinning is happening,
1298 * start a timing of our delay interval now.
1299 */
1300 readers_at_sleep = lck->lck_rw_shared_count;
1301 wait_interval = mach_absolute_time();
1302 }
1303 }
1304 #endif
1305 if (istate == -1)
1306 istate = ml_get_interrupts_enabled();
1307
1308 deadline = lck_rw_deadline_for_spin(lck);
1309
1310 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1311 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1312
1313 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1314 lck_rw_lock_pause(istate);
1315
1316 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1317 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1318
1319 if (gotlock)
1320 break;
1321 /*
1322 * if we get here, the deadline has expired w/o us
1323 * being able to grab the lock for read
1324 * check to see if we're allowed to do a thread_block
1325 */
1326 if (lck->lck_rw_can_sleep) {
1327
1328 istate = lck_interlock_lock(lck);
1329
1330 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1331 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1332
1333 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1334 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1335
1336 lck->lck_r_waiting = TRUE;
1337
1338 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1339 res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1340 lck_interlock_unlock(lck, istate);
1341
1342 if (res == THREAD_WAITING) {
1343 res = thread_block(THREAD_CONTINUE_NULL);
1344 slept++;
1345 }
1346 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1347 trace_lck, res, slept, 0, 0);
1348 } else {
1349 lck->lck_rw_shared_count++;
1350 lck_interlock_unlock(lck, istate);
1351 break;
1352 }
1353 }
1354 }
1355
1356 #if CONFIG_DTRACE
1357 if (dtrace_ls_enabled == TRUE) {
1358 if (slept == 0) {
1359 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1360 } else {
1361 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1362 mach_absolute_time() - wait_interval, 0,
1363 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1364 }
1365 }
1366 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1367 #endif
1368 }
1369
1370
1371 /*
1372 * Routine: lck_rw_lock_shared_to_exclusive_failure
1373 * Function:
1374 * assembly fast path code has already dropped our read
1375 * count and determined that someone else owns 'lck_rw_want_upgrade'
1376 * if 'lck_rw_shared_count' == 0, it's also already dropped 'lck_w_waiting'
1377 * all we need to do here is determine if a wakeup is needed
1378 */
1379 boolean_t
1380 lck_rw_lock_shared_to_exclusive_failure(
1381 lck_rw_t *lck,
1382 int prior_lock_state)
1383 {
1384 lck_rw_t *fake_lck;
1385 thread_t thread = current_thread();
1386 uint32_t rwlock_count;
1387
1388 /* Check if dropping the lock means that we need to unpromote */
1389 rwlock_count = thread->rwlock_count--;
1390 #if MACH_LDEBUG
1391 if (rwlock_count == 0) {
1392 panic("rw lock count underflow for thread %p", thread);
1393 }
1394 #endif
1395 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1396 /* sched_flags checked without lock, but will be rechecked while clearing */
1397 lck_rw_clear_promotion(thread);
1398 }
1399
1400 /*
1401 * prior_lock state is a snapshot of the 1st word of the
1402 * lock in question... we'll fake up a pointer to it
1403 * and carefully not access anything beyond what's defined
1404 * in the first word of a lck_rw_t
1405 */
1406 fake_lck = (lck_rw_t *)&prior_lock_state;
1407
1408 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1409 /*
1410 * Someone else has requested upgrade.
1411 * Since we've released the read lock, wake
1412 * him up if he's blocked waiting
1413 */
1414 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1415 }
1416 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1417 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1418
1419 return (FALSE);
1420 }
1421
1422
1423 /*
1424 * Routine: lck_rw_lock_shared_to_exclusive_success
1425 * Function:
1426 * assembly fast path code has already dropped our read
1427 * count and successfully acquired 'lck_rw_want_upgrade'
1428 * we just need to wait for the rest of the readers to drain
1429 * and then we can return as the exclusive holder of this lock
1430 */
1431 boolean_t
1432 lck_rw_lock_shared_to_exclusive_success(
1433 lck_rw_t *lck)
1434 {
1435 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1436 uint64_t deadline = 0;
1437 int slept = 0;
1438 int still_shared = 0;
1439 wait_result_t res;
1440 boolean_t istate = -1;
1441
1442 #if CONFIG_DTRACE
1443 uint64_t wait_interval = 0;
1444 int readers_at_sleep = 0;
1445 boolean_t dtrace_ls_initialized = FALSE;
1446 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1447 #endif
1448
1449 while (lck->lck_rw_shared_count != 0) {
1450
1451 #if CONFIG_DTRACE
1452 if (dtrace_ls_initialized == FALSE) {
1453 dtrace_ls_initialized = TRUE;
1454 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1455 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1456 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1457 if (dtrace_ls_enabled) {
1458 /*
1459 * Either sleeping or spinning is happening,
1460 * start a timing of our delay interval now.
1461 */
1462 readers_at_sleep = lck->lck_rw_shared_count;
1463 wait_interval = mach_absolute_time();
1464 }
1465 }
1466 #endif
1467 if (istate == -1)
1468 istate = ml_get_interrupts_enabled();
1469
1470 deadline = lck_rw_deadline_for_spin(lck);
1471
1472 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1473 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1474
1475 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1476 lck_rw_lock_pause(istate);
1477
1478 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1479 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1480
1481 if ( !still_shared)
1482 break;
1483 /*
1484 * if we get here, the deadline has expired w/o
1485 * the rw_shared_count having drained to 0
1486 * check to see if we're allowed to do a thread_block
1487 */
1488 if (lck->lck_rw_can_sleep) {
1489
1490 istate = lck_interlock_lock(lck);
1491
1492 if (lck->lck_rw_shared_count != 0) {
1493 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1494 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1495
1496 lck->lck_w_waiting = TRUE;
1497
1498 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1499 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1500 lck_interlock_unlock(lck, istate);
1501
1502 if (res == THREAD_WAITING) {
1503 res = thread_block(THREAD_CONTINUE_NULL);
1504 slept++;
1505 }
1506 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1507 trace_lck, res, slept, 0, 0);
1508 } else {
1509 lck_interlock_unlock(lck, istate);
1510 break;
1511 }
1512 }
1513 }
1514 #if CONFIG_DTRACE
1515 /*
1516 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1517 */
1518 if (dtrace_ls_enabled == TRUE) {
1519 if (slept == 0) {
1520 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1521 } else {
1522 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1523 mach_absolute_time() - wait_interval, 1,
1524 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1525 }
1526 }
1527 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1528 #endif
1529 return (TRUE);
1530 }
1531
1532
1533 /*
1534 * Routine: lck_rw_lock_exclusive_to_shared
1535 * Function:
1536 * assembly fast path has already dropped
1537 * our exclusive state and bumped lck_rw_shared_count
1538 * all we need to do here is determine if anyone
1539 * needs to be awakened.
1540 */
1541 void
1542 lck_rw_lock_exclusive_to_shared_gen(
1543 lck_rw_t *lck,
1544 int prior_lock_state)
1545 {
1546 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1547 lck_rw_t *fake_lck;
1548
1549 /*
1550 * prior_lock state is a snapshot of the 1st word of the
1551 * lock in question... we'll fake up a pointer to it
1552 * and carefully not access anything beyond what's defined
1553 * in the first word of a lck_rw_t
1554 */
1555 fake_lck = (lck_rw_t *)&prior_lock_state;
1556
1557 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1558 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1559
1560 /*
1561 * don't wake up anyone waiting to take the lock exclusively
1562 * since we hold a read count... when the read count drops to 0,
1563 * the writers will be woken.
1564 *
1565 * wake up any waiting readers if we don't have any writers waiting,
1566 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1567 */
1568 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1569 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1570
1571 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1572 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1573
1574 #if CONFIG_DTRACE
1575 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1576 #endif
1577 }
1578
1579
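/*
 * A minimal sketch of the shared-to-exclusive upgrade and the matching
 * downgrade serviced by the routines above; the wrapper function and its
 * parameter are illustrative only. Note that when
 * lck_rw_lock_shared_to_exclusive() returns FALSE, the fast path has
 * already dropped the read count, so the caller holds the lock in neither
 * mode.
 */
static void __unused
lck_rw_upgrade_sketch(lck_rw_t *lck)
{
	lck_rw_lock_shared(lck);

	if (lck_rw_lock_shared_to_exclusive(lck)) {
		/* upgrade succeeded: we are now the exclusive holder */
		lck_rw_lock_exclusive_to_shared(lck);	/* downgrade back to shared */
		lck_rw_unlock_shared(lck);
	} else {
		/* upgrade failed: our shared hold was released for us */
		lck_rw_lock_exclusive(lck);		/* reacquire from scratch if still needed */
		lck_rw_unlock_exclusive(lck);
	}
}
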
1580 /*
1581 * Routine: lck_rw_try_lock
1582 */
1583 boolean_t
1584 lck_rw_try_lock(
1585 lck_rw_t *lck,
1586 lck_rw_type_t lck_rw_type)
1587 {
1588 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1589 return(lck_rw_try_lock_shared(lck));
1590 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1591 return(lck_rw_try_lock_exclusive(lck));
1592 else
1593 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1594 return(FALSE);
1595 }
1596
1597
1598 void
1599 lck_rw_assert(
1600 lck_rw_t *lck,
1601 unsigned int type)
1602 {
1603 switch (type) {
1604 case LCK_RW_ASSERT_SHARED:
1605 if (lck->lck_rw_shared_count != 0) {
1606 return;
1607 }
1608 break;
1609 case LCK_RW_ASSERT_EXCLUSIVE:
1610 if ((lck->lck_rw_want_write ||
1611 lck->lck_rw_want_upgrade) &&
1612 lck->lck_rw_shared_count == 0) {
1613 return;
1614 }
1615 break;
1616 case LCK_RW_ASSERT_HELD:
1617 if (lck->lck_rw_want_write ||
1618 lck->lck_rw_want_upgrade ||
1619 lck->lck_rw_shared_count != 0) {
1620 return;
1621 }
1622 break;
1623 case LCK_RW_ASSERT_NOTHELD:
1624 if (!(lck->lck_rw_want_write ||
1625 lck->lck_rw_want_upgrade ||
1626 lck->lck_rw_shared_count != 0)) {
1627 return;
1628 }
1629 break;
1630 default:
1631 break;
1632 }
1633
1634 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1635 }
1636
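/*
 * A minimal sketch of how lck_rw_assert() is typically used to document and
 * enforce a routine's locking precondition; the wrapper function and its
 * parameter are illustrative only.
 */
static void __unused
lck_rw_assert_sketch(lck_rw_t *lck)
{
	/* caller must already hold the lock in exclusive mode */
	lck_rw_assert(lck, LCK_RW_ASSERT_EXCLUSIVE);

	/* ... code that relies on exclusive ownership ... */
}
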
1637 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1638 void
1639 lck_rw_clear_promotions_x86(thread_t thread)
1640 {
1641 #if MACH_LDEBUG
1642 /* It's fatal to leave a RW lock locked and return to userspace */
1643 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1644 #else
1645 /* Paper over the issue */
1646 thread->rwlock_count = 0;
1647 lck_rw_clear_promotion(thread);
1648 #endif
1649 }
1650
1651
1652 /*
1653 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1654 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1655 */
1656 boolean_t
1657 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
1658 if (not_in_kdp) {
1659 panic("panic: rw lock exclusive check done outside of kernel debugger");
1660 }
1661 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1662 }
1663
1664
1665 #ifdef MUTEX_ZONE
1666 extern zone_t lck_mtx_zone;
1667 #endif
1668 /*
1669 * Routine: lck_mtx_alloc_init
1670 */
1671 lck_mtx_t *
1672 lck_mtx_alloc_init(
1673 lck_grp_t *grp,
1674 lck_attr_t *attr)
1675 {
1676 lck_mtx_t *lck;
1677 #ifdef MUTEX_ZONE
1678 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
1679 lck_mtx_init(lck, grp, attr);
1680 #else
1681 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1682 lck_mtx_init(lck, grp, attr);
1683 #endif
1684 return(lck);
1685 }
1686
1687 /*
1688 * Routine: lck_mtx_free
1689 */
1690 void
1691 lck_mtx_free(
1692 lck_mtx_t *lck,
1693 lck_grp_t *grp)
1694 {
1695 lck_mtx_destroy(lck, grp);
1696 #ifdef MUTEX_ZONE
1697 zfree(lck_mtx_zone, lck);
1698 #else
1699 kfree(lck, sizeof(lck_mtx_t));
1700 #endif
1701 }
1702
1703 /*
1704 * Routine: lck_mtx_ext_init
1705 */
1706 static void
1707 lck_mtx_ext_init(
1708 lck_mtx_ext_t *lck,
1709 lck_grp_t *grp,
1710 lck_attr_t *attr)
1711 {
1712 bzero((void *)lck, sizeof(lck_mtx_ext_t));
1713
1714 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1715 lck->lck_mtx_deb.type = MUTEX_TAG;
1716 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1717 }
1718
1719 lck->lck_mtx_grp = grp;
1720
1721 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1722 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1723
1724 lck->lck_mtx.lck_mtx_is_ext = 1;
1725 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
1726 }
1727
1728 /*
1729 * Routine: lck_mtx_init
1730 */
1731 void
1732 lck_mtx_init(
1733 lck_mtx_t *lck,
1734 lck_grp_t *grp,
1735 lck_attr_t *attr)
1736 {
1737 lck_mtx_ext_t *lck_ext;
1738 lck_attr_t *lck_attr;
1739
1740 if (attr != LCK_ATTR_NULL)
1741 lck_attr = attr;
1742 else
1743 lck_attr = &LockDefaultLckAttr;
1744
1745 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1746 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1747 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1748 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1749 lck->lck_mtx_ptr = lck_ext;
1750 }
1751 } else {
1752 lck->lck_mtx_owner = 0;
1753 lck->lck_mtx_state = 0;
1754 }
1755 lck->lck_mtx_pad32 = 0xFFFFFFFF;
1756 lck_grp_reference(grp);
1757 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1758 }
1759
1760 /*
1761 * Routine: lck_mtx_init_ext
1762 */
1763 void
1764 lck_mtx_init_ext(
1765 lck_mtx_t *lck,
1766 lck_mtx_ext_t *lck_ext,
1767 lck_grp_t *grp,
1768 lck_attr_t *attr)
1769 {
1770 lck_attr_t *lck_attr;
1771
1772 if (attr != LCK_ATTR_NULL)
1773 lck_attr = attr;
1774 else
1775 lck_attr = &LockDefaultLckAttr;
1776
1777 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1778 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1779 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1780 lck->lck_mtx_ptr = lck_ext;
1781 } else {
1782 lck->lck_mtx_owner = 0;
1783 lck->lck_mtx_state = 0;
1784 }
1785 lck->lck_mtx_pad32 = 0xFFFFFFFF;
1786
1787 lck_grp_reference(grp);
1788 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1789 }
1790
1791 /*
1792 * Routine: lck_mtx_destroy
1793 */
1794 void
1795 lck_mtx_destroy(
1796 lck_mtx_t *lck,
1797 lck_grp_t *grp)
1798 {
1799 boolean_t lck_is_indirect;
1800
1801 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1802 return;
1803 #if MACH_LDEBUG
1804 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
1805 #endif
1806 lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1807
1808 lck_mtx_lock_mark_destroyed(lck);
1809
1810 if (lck_is_indirect)
1811 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1812 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1813 lck_grp_deallocate(grp);
1814 return;
1815 }
1816
1817
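/*
 * A minimal usage sketch of the mutex allocation/teardown routines above;
 * the group name, mutex name, and wrapper function are illustrative only,
 * and lck_mtx_lock()/lck_mtx_try_lock()/lck_mtx_unlock() are the standard
 * entry points declared in kern/locks.h.
 */
static void __unused
lck_mtx_usage_sketch(void)
{
	lck_grp_t	*grp = lck_grp_alloc_init("example.mutex", LCK_GRP_ATTR_NULL);
	lck_mtx_t	*mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

	lck_mtx_lock(mtx);			/* may block; safe to sleep while held */
	/* ... critical section; blocking operations are permitted ... */
	lck_mtx_unlock(mtx);

	if (lck_mtx_try_lock(mtx)) {		/* non-blocking attempt */
		lck_mtx_unlock(mtx);
	}

	lck_mtx_free(mtx, grp);			/* destroys the mutex and frees its storage */
	lck_grp_free(grp);
}
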
1818 #define LCK_MTX_LCK_WAIT_CODE 0x20
1819 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
1820 #define LCK_MTX_LCK_SPIN_CODE 0x22
1821 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
1822 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
1823
1824
1825 /*
1826 * Routine: lck_mtx_unlock_wakeup_x86
1827 *
1828 * Invoked on unlock when there is
1829 * contention (i.e. the assembly routine sees that
1830 * mutex->lck_mtx_waiters != 0 or
1831 * mutex->lck_mtx_promoted != 0)...
1832 *
1833 * neither the mutex nor the interlock is held
1834 */
1835 void
1836 lck_mtx_unlock_wakeup_x86 (
1837 lck_mtx_t *mutex,
1838 int prior_lock_state)
1839 {
1840 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1841 lck_mtx_t fake_lck;
1842
1843 /*
1844 * prior_lock state is a snapshot of the 2nd word of the
1845 * lock in question... we'll fake up a lock with the bits
1846 * copied into place and carefully not access anything
1847 * beyond whats defined in the second word of a lck_mtx_t
1848 */
1849 fake_lck.lck_mtx_state = prior_lock_state;
1850
1851 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
1852 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
1853
1854 if (__probable(fake_lck.lck_mtx_waiters)) {
1855 if (fake_lck.lck_mtx_waiters > 1)
1856 thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
1857 else
1858 thread_wakeup_one(LCK_MTX_EVENT(mutex));
1859 }
1860
1861 if (__improbable(fake_lck.lck_mtx_promoted)) {
1862 thread_t thread = current_thread();
1863
1864
1865 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
1866 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
1867
1868 if (thread->promotions > 0) {
1869 spl_t s = splsched();
1870
1871 thread_lock(thread);
1872
1873 if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
1874
1875 thread->sched_flags &= ~TH_SFLAG_PROMOTED;
1876
1877 if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
1878 /* Thread still has a RW lock promotion */
1879 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1880 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1881 thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
1882
1883 set_sched_pri(thread, DEPRESSPRI);
1884 }
1885 else {
1886 if (thread->base_pri < thread->sched_pri) {
1887 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1888 thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
1889
1890 thread_recompute_sched_pri(thread, FALSE);
1891 }
1892 }
1893 }
1894 thread_unlock(thread);
1895 splx(s);
1896 }
1897 }
1898 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
1899 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1900 }
1901
1902
1903 /*
1904 * Routine: lck_mtx_lock_acquire_x86
1905 *
1906 * Invoked on acquiring the mutex when there is
1907 * contention (i.e. the assembly routine sees that
1908 * mutex->lck_mtx_waiters != 0 or
1909 * thread->was_promoted_on_wakeup != 0)...
1910 *
1911 * mutex is owned... interlock is held... preemption is disabled
1912 */
1913 void
1914 lck_mtx_lock_acquire_x86(
1915 lck_mtx_t *mutex)
1916 {
1917 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1918 thread_t thread;
1919 integer_t priority;
1920 spl_t s;
1921
1922 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
1923 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1924
1925 if (mutex->lck_mtx_waiters)
1926 priority = mutex->lck_mtx_pri;
1927 else
1928 priority = 0;
1929
1930 thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
1931
1932 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
1933
1934 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
1935 thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);
1936
1937 s = splsched();
1938 thread_lock(thread);
1939
1940 if (thread->sched_pri < priority) {
1941 /* Do not promote past promotion ceiling */
1942 assert(priority <= MAXPRI_PROMOTE);
1943 set_sched_pri(thread, priority);
1944 }
1945 if (mutex->lck_mtx_promoted == 0) {
1946 mutex->lck_mtx_promoted = 1;
1947
1948 thread->promotions++;
1949 thread->sched_flags |= TH_SFLAG_PROMOTED;
1950 }
1951 thread->was_promoted_on_wakeup = 0;
1952
1953 thread_unlock(thread);
1954 splx(s);
1955 }
1956 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1957 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1958 }
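/*
 * Illustrative sketch (not part of the build): the promotion bookkeeping
 * protocol shared by lck_mtx_lock_acquire_x86(), lck_mtx_lock_wait_x86()
 * and lck_mtx_unlock_wakeup_x86().  Each mutex contributes at most one
 * promotion to its holder (guarded by lck_mtx_promoted); the holder's
 * promotions count tracks how many promoting mutexes are outstanding, and
 * TH_SFLAG_PROMOTED is cleared only when that count drains to zero.  The
 * helper names below are hypothetical; both assume the caller holds the
 * interlock and the thread lock at splsched().
 */
#if 0
static void
mutex_grant_promotion(lck_mtx_t *mutex, thread_t holder, integer_t priority)
{
	if (holder->sched_pri < priority)
		set_sched_pri(holder, priority);

	if (mutex->lck_mtx_promoted == 0) {
		mutex->lck_mtx_promoted = 1;	/* this mutex now accounts for one promotion */
		holder->promotions++;
		holder->sched_flags |= TH_SFLAG_PROMOTED;
	}
}

static void
mutex_release_promotion(thread_t thread)
{
	if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED))
		thread->sched_flags &= ~TH_SFLAG_PROMOTED;
	/* the unlock path then depresses or recomputes sched_pri as needed */
}
#endif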
1959
1960
1961 static int
1962 lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
1963 {
1964 int retval;
1965
1966 *istate = ml_set_interrupts_enabled(FALSE);
1967 retval = lck_mtx_ilk_try_lock(mutex);
1968
1969 if (retval == 0)
1970 ml_set_interrupts_enabled(*istate);
1971
1972 return retval;
1973 }
1974
1975 static void
1976 lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
1977 {
1978 lck_mtx_ilk_unlock(mutex);
1979 ml_set_interrupts_enabled(istate);
1980 }
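/*
 * Illustrative sketch (not part of the build): intended usage of the
 * interlock try/unlock pair above.  The interrupt state returned through
 * 'istate' must be handed back unchanged on unlock so interrupts are
 * restored exactly as they were found.  peek_owner() is a hypothetical
 * caller, shown only for the pattern.
 */
#if 0
static thread_t
peek_owner(lck_mtx_t *mutex)
{
	boolean_t	istate;
	thread_t	holder = THREAD_NULL;

	if (lck_mtx_interlock_try_lock(mutex, &istate)) {
		holder = (thread_t) mutex->lck_mtx_owner;
		lck_mtx_interlock_unlock(mutex, istate);
	}
	return holder;
}
#endif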
1981
1982
1983 /*
1984 * Routine: lck_mtx_lock_spinwait_x86
1985 *
1986 * Invoked trying to acquire a mutex when there is contention but
1987 * the holder is running on another processor. We spin for up to a maximum
1988 * time waiting for the lock to be released.
1989 *
1990 * Called with the interlock unlocked.
1991 * returns 0 if the mutex was acquired
1992 * returns 1 if we spun without acquiring the mutex
1993 * returns 2 if we didn't spin because the holder isn't running (or is idling)
1994 */
1995 int
1996 lck_mtx_lock_spinwait_x86(
1997 lck_mtx_t *mutex)
1998 {
1999 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2000 thread_t holder;
2001 uint64_t overall_deadline;
2002 uint64_t check_owner_deadline;
2003 uint64_t cur_time;
2004 int retval = 1;
2005 int loopcount = 0;
2006
2007 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2008 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
2009
2010 cur_time = mach_absolute_time();
2011 overall_deadline = cur_time + MutexSpin;
2012 check_owner_deadline = cur_time;
2013
2014 /*
2015 * Spin while:
2016 * - mutex is locked, and
2017 * - it's locked as a spin lock, and
2018 * - owner is running on another processor, and
2019 * - owner (processor) is not idling, and
2020 * - we haven't spun for long enough.
2021 */
2022 do {
2023 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
2024 retval = 0;
2025 break;
2026 }
2027 cur_time = mach_absolute_time();
2028
2029 if (cur_time >= overall_deadline)
2030 break;
2031
2032 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
2033 boolean_t istate;
2034
2035 if (lck_mtx_interlock_try_lock(mutex, &istate)) {
2036
2037 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
2038
2039 if ( !(holder->machine.specFlags & OnProc) ||
2040 (holder->state & TH_IDLE)) {
2041
2042 lck_mtx_interlock_unlock(mutex, istate);
2043
2044 if (loopcount == 0)
2045 retval = 2;
2046 break;
2047 }
2048 }
2049 lck_mtx_interlock_unlock(mutex, istate);
2050
2051 check_owner_deadline = cur_time + (MutexSpin / 4);
2052 }
2053 }
2054 cpu_pause();
2055
2056 loopcount++;
2057
2058 } while (TRUE);
2059
2060 #if CONFIG_DTRACE
2061 /*
2062 * overall_deadline already encodes when we started spinning; if DTrace
2063 * is active, we work backwards (start = overall_deadline - MutexSpin)
2064 * to compute how long we spun.
2065 *
2066 * Note that we record a different probe id depending on whether
2067 * this is a direct or indirect mutex. This allows us to
2068 * penalize only lock groups that have debug/stats enabled
2069 * with dtrace processing if desired.
2070 */
2071 if (__probable(mutex->lck_mtx_is_ext == 0)) {
2072 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
2073 mach_absolute_time() - (overall_deadline - MutexSpin));
2074 } else {
2075 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
2076 mach_absolute_time() - (overall_deadline - MutexSpin));
2077 }
2078 /* The lockstat acquire event is recorded by the assembly code beneath us. */
2079 #endif
2080
2081 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2082 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
2083
2084 return retval;
2085 }
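/*
 * Illustrative sketch (not part of the build): how a caller could dispatch
 * on the return codes documented above (the real dispatch lives in the
 * assembly slow path).  lck_mtx_lock_contended() is a hypothetical name.
 */
#if 0
static void
lck_mtx_lock_contended(lck_mtx_t *mutex)
{
	if (lck_mtx_lock_spinwait_x86(mutex) == 0)
		return;		/* acquired the mutex while spinning */

	/*
	 * 1: spun for the full window without getting the mutex.
	 * 2: the holder isn't running, so spinning would only burn cycles.
	 * Either way the slow path takes the interlock and blocks in
	 * lck_mtx_lock_wait_x86(), which drops the interlock before sleeping.
	 */
}
#endif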
2086
2087
2088
2089 /*
2090 * Routine: lck_mtx_lock_wait_x86
2091 *
2092 * Invoked in order to wait on contention.
2093 *
2094 * Called with the interlock locked and
2095 * preemption disabled...
2096 * returns it unlocked and with preemption enabled
2097 */
2098 void
2099 lck_mtx_lock_wait_x86 (
2100 lck_mtx_t *mutex)
2101 {
2102 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2103 thread_t self = current_thread();
2104 thread_t holder;
2105 integer_t priority;
2106 spl_t s;
2107 #if CONFIG_DTRACE
2108 uint64_t sleep_start = 0;
2109
2110 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
2111 sleep_start = mach_absolute_time();
2112 }
2113 #endif
2114 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
2115 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2116
2117 priority = self->sched_pri;
2118
2119 if (priority < self->base_pri)
2120 priority = self->base_pri;
2121 if (priority < BASEPRI_DEFAULT)
2122 priority = BASEPRI_DEFAULT;
2123
2124 /* Do not promote past promotion ceiling */
2125 priority = MIN(priority, MAXPRI_PROMOTE);
2126
2127 if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
2128 mutex->lck_mtx_pri = priority;
2129 mutex->lck_mtx_waiters++;
2130
2131 if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
2132 holder->sched_pri < mutex->lck_mtx_pri ) {
2133 s = splsched();
2134 thread_lock(holder);
2135
2136 /* holder priority may have been bumped by another thread
2137 * before thread_lock was taken
2138 */
2139 if (holder->sched_pri < mutex->lck_mtx_pri) {
2140 KERNEL_DEBUG_CONSTANT(
2141 MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
2142 holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
2143 /* Assert that we're not altering the priority of a
2144 * thread above the MAXPRI_PROMOTE band
2145 */
2146 assert(holder->sched_pri < MAXPRI_PROMOTE);
2147 set_sched_pri(holder, priority);
2148
2149 if (mutex->lck_mtx_promoted == 0) {
2150 holder->promotions++;
2151 holder->sched_flags |= TH_SFLAG_PROMOTED;
2152
2153 mutex->lck_mtx_promoted = 1;
2154 }
2155 }
2156 thread_unlock(holder);
2157 splx(s);
2158 }
2159 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
2160 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
2161
2162 lck_mtx_ilk_unlock(mutex);
2163
2164 thread_block(THREAD_CONTINUE_NULL);
2165
2166 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
2167 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2168
2169 #if CONFIG_DTRACE
2170 /*
2171 * Record the DTrace lockstat probe for blocking; block time is
2172 * measured from when this routine was entered.
2173 */
2174 if (sleep_start) {
2175 if (mutex->lck_mtx_is_ext == 0) {
2176 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2177 mach_absolute_time() - sleep_start);
2178 } else {
2179 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2180 mach_absolute_time() - sleep_start);
2181 }
2182 }
2183 #endif
2184 }
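/*
 * Illustrative sketch (not part of the build): the wakeup-safe ordering at
 * the end of lck_mtx_lock_wait_x86() above.  The waiter registers itself on
 * the event while the interlock is still held, so any unlocker that later
 * takes the interlock is guaranteed to see lck_mtx_waiters != 0 and post a
 * wakeup; only then does the waiter drop the interlock and actually block.
 */
#if 0
	assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);	/* 1. enqueue as a waiter (interlock held) */
	lck_mtx_ilk_unlock(mutex);				/* 2. now the unlock path can run */
	thread_block(THREAD_CONTINUE_NULL);			/* 3. sleep until lck_mtx_unlock_wakeup_x86() wakes us */
#endif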
2185
2186 /*
2187 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2188 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2189 * Returns: TRUE if lock is acquired.
2190 */
2191 boolean_t
2192 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2193 {
2194 if (not_in_kdp) {
2195 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
2196 }
2197
2198 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
2199 return TRUE;
2200 }
2201
2202 return FALSE;
2203 }
2204
2205 void
2206 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2207 {
2208 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
2209 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2210 thread_t holder = (thread_t)mutex->lck_mtx_owner;
2211 waitinfo->owner = thread_tid(holder);
2212 }
2213
2214 void
2215 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2216 {
2217 lck_rw_t *rwlck = NULL;
2218 switch(waitinfo->wait_type) {
2219 case kThreadWaitKernelRWLockRead:
2220 rwlck = READ_EVENT_TO_RWLOCK(event);
2221 break;
2222 case kThreadWaitKernelRWLockWrite:
2223 case kThreadWaitKernelRWLockUpgrade:
2224 rwlck = WRITE_EVENT_TO_RWLOCK(event);
2225 break;
2226 default:
2227 panic("%s was called with an invalid blocking type", __FUNCTION__);
2228 break;
2229 }
2230 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2231 waitinfo->owner = 0;
2232 }