]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/i386/locks_i386.c
xnu-1228.7.58.tar.gz
[apple/xnu.git] / osfmk / i386 / locks_i386.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64#include <mach_kdb.h>
65#include <mach_ldebug.h>
66
67#include <kern/lock.h>
68#include <kern/locks.h>
69#include <kern/kalloc.h>
70#include <kern/misc_protos.h>
71#include <kern/thread.h>
72#include <kern/processor.h>
73#include <kern/cpu_data.h>
74#include <kern/cpu_number.h>
75#include <kern/sched_prim.h>
76#include <kern/xpr.h>
77#include <kern/debug.h>
78#include <string.h>
79
80#if MACH_KDB
81#include <ddb/db_command.h>
82#include <ddb/db_output.h>
83#include <ddb/db_sym.h>
84#include <ddb/db_print.h>
85#endif /* MACH_KDB */
86
87#include <i386/machine_cpu.h>
88
89#include <sys/kdebug.h>
90
91/*
92 * We need only enough declarations from the BSD-side to be able to
93 * test if our probe is active, and to call __dtrace_probe(). Setting
94 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
95 */
96#if CONFIG_DTRACE
97#define NEED_DTRACE_DEFS
98#include <../bsd/sys/lockstat.h>
99#endif
100
/*
 * kdebug sub-codes for the read/write lock trace points in this file;
 * they are combined with DBG_MACH_LOCKS through MACHDBG_CODE().
 */
#define	LCK_RW_LCK_EXCLUSIVE_CODE	0x100
#define	LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
#define	LCK_RW_LCK_SHARED_CODE		0x102
#define	LCK_RW_LCK_SH_TO_EX_CODE	0x103
#define	LCK_RW_LCK_SH_TO_EX1_CODE	0x104
#define	LCK_RW_LCK_EX_TO_SH_CODE	0x105

#define	LCK_MTX_LCK_SPIN		0x200

/* Non-zero when any of the lock debugging options is compiled in. */
#define	ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

unsigned int	LcksOpts = 0;

/*
 * Spin budgets used by the rw-lock slow paths, indexed by
 * (lck_rw_can_sleep ? 1 : 0).
 */
unsigned int	lock_wait_time[2] = { (unsigned int)-1, 0 };
114
/*
 * Forward declarations for the print helpers that exist only when
 * the kernel debugger (MACH_KDB) is configured.
 */
#if	MACH_KDB
void	db_print_simple_lock(simple_lock_t addr);

void	db_print_mutex(mutex_t *addr);
#endif	/* MACH_KDB */
124
125
#if	USLOCK_DEBUG
/*
 *	Simple-lock checking controls, plus the externally defined
 *	locks this file refers to when USLOCK_DEBUG is configured.
 */
int		uslock_check = 1;	/* consulted by USLOCK_CHECKING() */
int		max_lock_loops = 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
#if	MACH_KDB
decl_simple_lock_data(extern , kdb_lock)
#endif	/* MACH_KDB */
#endif	/* USLOCK_DEBUG */
138
139
/*
 *	The lock routines frequently want the address of their caller,
 *	but only for debugging and statistics.  DECL_PC()/OBTAIN_PC()
 *	declare and fill in a local pc when some lock-debug option is
 *	configured, and expand to nothing otherwise.
 */
typedef void	*pc_t;
#define	INVALID_PC	((void *) VM_MAX_KERNEL_ADDRESS)
#define	INVALID_THREAD	((void *) VM_MAX_KERNEL_ADDRESS)
#if	ANY_LOCK_DEBUG
#define	OBTAIN_PC(pc,l)	((pc) = (void *) GET_RETURN_PC(&(l)))
#define	DECL_PC(pc)	pc_t pc;
#else	/* ANY_LOCK_DEBUG */
#define	DECL_PC(pc)
#ifdef	lint
/*
 *	Keep lint from complaining about unused local pc variables.
 */
#define	OBTAIN_PC(pc,l)	++pc
#else	/* lint */
#define	OBTAIN_PC(pc,l)
#endif	/* lint */
#endif	/* ANY_LOCK_DEBUG */
162
163
/*
 *	Portable lock package implementation of usimple_locks.
 *
 *	USLDBG(stmt) emits stmt only when USLOCK_DEBUG is configured;
 *	the usld_*() checking hooks are declared under the same option.
 */

#if	USLOCK_DEBUG
#define	USLDBG(stmt)	stmt
void	usld_lock_init(usimple_lock_t, unsigned short);
void	usld_lock_pre(usimple_lock_t, pc_t);
void	usld_lock_post(usimple_lock_t, pc_t);
void	usld_unlock(usimple_lock_t, pc_t);
void	usld_lock_try_pre(usimple_lock_t, pc_t);
void	usld_lock_try_post(usimple_lock_t, pc_t);
int	usld_lock_common_checks(usimple_lock_t, char *);
#else	/* USLOCK_DEBUG */
#define	USLDBG(stmt)
#endif	/* USLOCK_DEBUG */
180
181/*
182 * Forward definitions
183 */
184
185void lck_rw_lock_shared_gen(
186 lck_rw_t *lck);
187
188lck_rw_type_t lck_rw_done_gen(
189 lck_rw_t *lck);
190
191/*
192 * Routine: lck_spin_alloc_init
193 */
194lck_spin_t *
195lck_spin_alloc_init(
196 lck_grp_t *grp,
197 lck_attr_t *attr)
198{
199 lck_spin_t *lck;
200
201 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
202 lck_spin_init(lck, grp, attr);
203
204 return(lck);
205}
206
207/*
208 * Routine: lck_spin_free
209 */
210void
211lck_spin_free(
212 lck_spin_t *lck,
213 lck_grp_t *grp)
214{
215 lck_spin_destroy(lck, grp);
216 kfree(lck, sizeof(lck_spin_t));
217}
218
219/*
220 * Routine: lck_spin_init
221 */
222void
223lck_spin_init(
224 lck_spin_t *lck,
225 lck_grp_t *grp,
226 __unused lck_attr_t *attr)
227{
228 usimple_lock_init((usimple_lock_t) lck, 0);
229 lck_grp_reference(grp);
230 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
231}
232
233/*
234 * Routine: lck_spin_destroy
235 */
236void
237lck_spin_destroy(
238 lck_spin_t *lck,
239 lck_grp_t *grp)
240{
241 if (lck->lck_spin_data[0] == LCK_SPIN_TAG_DESTROYED)
242 return;
243 lck->lck_spin_data[0] = LCK_SPIN_TAG_DESTROYED;
244 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
245 lck_grp_deallocate(grp);
246 return;
247}
248
249/*
250 * Routine: lck_spin_lock
251 */
252void
253lck_spin_lock(
254 lck_spin_t *lck)
255{
256 usimple_lock((usimple_lock_t) lck);
257}
258
259/*
260 * Routine: lck_spin_unlock
261 */
262void
263lck_spin_unlock(
264 lck_spin_t *lck)
265{
266 usimple_unlock((usimple_lock_t) lck);
267}
268
269
270/*
271 * Routine: lck_spin_try_lock
272 */
273boolean_t
274lck_spin_try_lock(
275 lck_spin_t *lck)
276{
277 return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
278}
279
280/*
281 * Initialize a usimple_lock.
282 *
283 * No change in preemption state.
284 */
285void
286usimple_lock_init(
287 usimple_lock_t l,
288 __unused unsigned short tag)
289{
290#ifndef MACHINE_SIMPLE_LOCK
291 USLDBG(usld_lock_init(l, tag));
292 hw_lock_init(&l->interlock);
293#else
294 simple_lock_init((simple_lock_t)l,tag);
295#endif
296}
297
298
299/*
300 * Acquire a usimple_lock.
301 *
302 * Returns with preemption disabled. Note
303 * that the hw_lock routines are responsible for
304 * maintaining preemption state.
305 */
306void
307usimple_lock(
308 usimple_lock_t l)
309{
310#ifndef MACHINE_SIMPLE_LOCK
311 DECL_PC(pc);
312
313 OBTAIN_PC(pc, l);
314 USLDBG(usld_lock_pre(l, pc));
315
316 if(!hw_lock_to(&l->interlock, LockTimeOutTSC)) /* Try to get the lock with a timeout */
317 panic("simple lock deadlock detection: lock=%p, cpu=%d, owning thread=0x%x", l, cpu_number(), l->interlock.lock_data);
318
319 USLDBG(usld_lock_post(l, pc));
320#else
321 simple_lock((simple_lock_t)l);
322#endif
323}
324
325
326/*
327 * Release a usimple_lock.
328 *
329 * Returns with preemption enabled. Note
330 * that the hw_lock routines are responsible for
331 * maintaining preemption state.
332 */
333void
334usimple_unlock(
335 usimple_lock_t l)
336{
337#ifndef MACHINE_SIMPLE_LOCK
338 DECL_PC(pc);
339
340 OBTAIN_PC(pc, l);
341 USLDBG(usld_unlock(l, pc));
342 hw_lock_unlock(&l->interlock);
343#else
344 simple_unlock_rwmb((simple_lock_t)l);
345#endif
346}
347
348
349/*
350 * Conditionally acquire a usimple_lock.
351 *
352 * On success, returns with preemption disabled.
353 * On failure, returns with preemption in the same state
354 * as when first invoked. Note that the hw_lock routines
355 * are responsible for maintaining preemption state.
356 *
357 * XXX No stats are gathered on a miss; I preserved this
358 * behavior from the original assembly-language code, but
359 * doesn't it make sense to log misses? XXX
360 */
361unsigned int
362usimple_lock_try(
363 usimple_lock_t l)
364{
365#ifndef MACHINE_SIMPLE_LOCK
366 unsigned int success;
367 DECL_PC(pc);
368
369 OBTAIN_PC(pc, l);
370 USLDBG(usld_lock_try_pre(l, pc));
371 if ((success = hw_lock_try(&l->interlock))) {
372 USLDBG(usld_lock_try_post(l, pc));
373 }
374 return success;
375#else
376 return(simple_lock_try((simple_lock_t)l));
377#endif
378}
379
380#if USLOCK_DEBUG
381/*
382 * States of a usimple_lock. The default when initializing
383 * a usimple_lock is setting it up for debug checking.
384 */
385#define USLOCK_CHECKED 0x0001 /* lock is being checked */
386#define USLOCK_TAKEN 0x0002 /* lock has been taken */
387#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
388#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
389#define USLOCK_CHECKING(l) (uslock_check && \
390 ((l)->debug.state & USLOCK_CHECKED))
391
392/*
393 * Trace activities of a particularly interesting lock.
394 */
395void usl_trace(usimple_lock_t, int, pc_t, const char *);
396
397
398/*
399 * Initialize the debugging information contained
400 * in a usimple_lock.
401 */
402void
403usld_lock_init(
404 usimple_lock_t l,
405 __unused unsigned short tag)
406{
407 if (l == USIMPLE_LOCK_NULL)
408 panic("lock initialization: null lock pointer");
409 l->lock_type = USLOCK_TAG;
410 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
411 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
412 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
413 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
414 l->debug.duration[0] = l->debug.duration[1] = 0;
415 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
416 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
417 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
418}
419
420
421/*
422 * These checks apply to all usimple_locks, not just
423 * those with USLOCK_CHECKED turned on.
424 */
425int
426usld_lock_common_checks(
427 usimple_lock_t l,
428 char *caller)
429{
430 if (l == USIMPLE_LOCK_NULL)
431 panic("%s: null lock pointer", caller);
432 if (l->lock_type != USLOCK_TAG)
433 panic("%s: 0x%x is not a usimple lock", caller, (integer_t) l);
434 if (!(l->debug.state & USLOCK_INIT))
435 panic("%s: 0x%x is not an initialized lock",
436 caller, (integer_t) l);
437 return USLOCK_CHECKING(l);
438}
439
440
441/*
442 * Debug checks on a usimple_lock just before attempting
443 * to acquire it.
444 */
445/* ARGSUSED */
446void
447usld_lock_pre(
448 usimple_lock_t l,
449 pc_t pc)
450{
451 char caller[] = "usimple_lock";
452
453
454 if (!usld_lock_common_checks(l, caller))
455 return;
456
457/*
458 * Note that we have a weird case where we are getting a lock when we are]
459 * in the process of putting the system to sleep. We are running with no
460 * current threads, therefore we can't tell if we are trying to retake a lock
461 * we have or someone on the other processor has it. Therefore we just
462 * ignore this test if the locking thread is 0.
463 */
464
465 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
466 l->debug.lock_thread == (void *) current_thread()) {
467 printf("%s: lock %p already locked (at %p) by",
468 caller, l, l->debug.lock_pc);
469 printf(" current thread %p (new attempt at pc %p)\n",
470 l->debug.lock_thread, pc);
471 panic("%s", caller);
472 }
473 mp_disable_preemption();
474 usl_trace(l, cpu_number(), pc, caller);
475 mp_enable_preemption();
476}
477
478
479/*
480 * Debug checks on a usimple_lock just after acquiring it.
481 *
482 * Pre-emption has been disabled at this point,
483 * so we are safe in using cpu_number.
484 */
485void
486usld_lock_post(
487 usimple_lock_t l,
488 pc_t pc)
489{
490 register int mycpu;
491 char caller[] = "successful usimple_lock";
492
493
494 if (!usld_lock_common_checks(l, caller))
495 return;
496
497 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
498 panic("%s: lock 0x%x became uninitialized",
499 caller, (integer_t) l);
500 if ((l->debug.state & USLOCK_TAKEN))
501 panic("%s: lock 0x%x became TAKEN by someone else",
502 caller, (integer_t) l);
503
504 mycpu = cpu_number();
505 l->debug.lock_thread = (void *)current_thread();
506 l->debug.state |= USLOCK_TAKEN;
507 l->debug.lock_pc = pc;
508 l->debug.lock_cpu = mycpu;
509
510 usl_trace(l, mycpu, pc, caller);
511}
512
513
514/*
515 * Debug checks on a usimple_lock just before
516 * releasing it. Note that the caller has not
517 * yet released the hardware lock.
518 *
519 * Preemption is still disabled, so there's
520 * no problem using cpu_number.
521 */
522void
523usld_unlock(
524 usimple_lock_t l,
525 pc_t pc)
526{
527 register int mycpu;
528 char caller[] = "usimple_unlock";
529
530
531 if (!usld_lock_common_checks(l, caller))
532 return;
533
534 mycpu = cpu_number();
535
536 if (!(l->debug.state & USLOCK_TAKEN))
537 panic("%s: lock 0x%x hasn't been taken",
538 caller, (integer_t) l);
539 if (l->debug.lock_thread != (void *) current_thread())
540 panic("%s: unlocking lock 0x%x, owned by thread %p",
541 caller, (integer_t) l, l->debug.lock_thread);
542 if (l->debug.lock_cpu != mycpu) {
543 printf("%s: unlocking lock 0x%x on cpu 0x%x",
544 caller, (integer_t) l, mycpu);
545 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
546 panic("%s", caller);
547 }
548 usl_trace(l, mycpu, pc, caller);
549
550 l->debug.unlock_thread = l->debug.lock_thread;
551 l->debug.lock_thread = INVALID_PC;
552 l->debug.state &= ~USLOCK_TAKEN;
553 l->debug.unlock_pc = pc;
554 l->debug.unlock_cpu = mycpu;
555}
556
557
558/*
559 * Debug checks on a usimple_lock just before
560 * attempting to acquire it.
561 *
562 * Preemption isn't guaranteed to be disabled.
563 */
564void
565usld_lock_try_pre(
566 usimple_lock_t l,
567 pc_t pc)
568{
569 char caller[] = "usimple_lock_try";
570
571 if (!usld_lock_common_checks(l, caller))
572 return;
573 mp_disable_preemption();
574 usl_trace(l, cpu_number(), pc, caller);
575 mp_enable_preemption();
576}
577
578
579/*
580 * Debug checks on a usimple_lock just after
581 * successfully attempting to acquire it.
582 *
583 * Preemption has been disabled by the
584 * lock acquisition attempt, so it's safe
585 * to use cpu_number.
586 */
587void
588usld_lock_try_post(
589 usimple_lock_t l,
590 pc_t pc)
591{
592 register int mycpu;
593 char caller[] = "successful usimple_lock_try";
594
595 if (!usld_lock_common_checks(l, caller))
596 return;
597
598 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
599 panic("%s: lock 0x%x became uninitialized",
600 caller, (integer_t) l);
601 if ((l->debug.state & USLOCK_TAKEN))
602 panic("%s: lock 0x%x became TAKEN by someone else",
603 caller, (integer_t) l);
604
605 mycpu = cpu_number();
606 l->debug.lock_thread = (void *) current_thread();
607 l->debug.state |= USLOCK_TAKEN;
608 l->debug.lock_pc = pc;
609 l->debug.lock_cpu = mycpu;
610
611 usl_trace(l, mycpu, pc, caller);
612}
613
614
615/*
616 * For very special cases, set traced_lock to point to a
617 * specific lock of interest. The result is a series of
618 * XPRs showing lock operations on that lock. The lock_seq
619 * value is used to show the order of those operations.
620 */
621usimple_lock_t traced_lock;
622unsigned int lock_seq;
623
624void
625usl_trace(
626 usimple_lock_t l,
627 int mycpu,
628 pc_t pc,
629 const char * op_name)
630{
631 if (traced_lock == l) {
632 XPR(XPR_SLOCK,
633 "seq %d, cpu %d, %s @ %x\n",
634 (integer_t) lock_seq, (integer_t) mycpu,
635 (integer_t) op_name, (integer_t) pc, 0);
636 lock_seq++;
637 }
638}
639
640
641#endif /* USLOCK_DEBUG */
642
643/*
644 * Routine: lock_alloc
645 * Function:
646 * Allocate a lock for external users who cannot
647 * hard-code the structure definition into their
648 * objects.
649 * For now just use kalloc, but a zone is probably
650 * warranted.
651 */
652lock_t *
653lock_alloc(
654 boolean_t can_sleep,
655 unsigned short tag,
656 unsigned short tag1)
657{
658 lock_t *l;
659
660 if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0)
661 lock_init(l, can_sleep, tag, tag1);
662 return(l);
663}
664
665/*
666 * Routine: lock_free
667 * Function:
668 * Free a lock allocated for external users.
669 * For now just use kfree, but a zone is probably
670 * warranted.
671 */
672void
673lock_free(
674 lock_t *l)
675{
676 kfree(l, sizeof(lock_t));
677}
678
679
680/*
681 * Routine: lock_init
682 * Function:
683 * Initialize a lock; required before use.
684 * Note that clients declare the "struct lock"
685 * variables and then initialize them, rather
686 * than getting a new one from this module.
687 */
688void
689lock_init(
690 lock_t *l,
691 boolean_t can_sleep,
692 __unused unsigned short tag,
693 __unused unsigned short tag1)
694{
695 hw_lock_byte_init(&l->lck_rw_interlock);
696 l->lck_rw_want_write = FALSE;
697 l->lck_rw_want_upgrade = FALSE;
698 l->lck_rw_shared_count = 0;
699 l->lck_rw_can_sleep = can_sleep;
700 l->lck_rw_tag = tag;
701 l->lck_rw_priv_excl = 1;
702}
703
704
/*
 *	Sleep locks.  These use the same data structure and algorithm
 *	as the spin locks, but the process sleeps while it is waiting
 *	for the lock.  These work on uniprocessor systems.
 */

/* Starting value for the MACH_LDEBUG spin-round countdown. */
#define	DECREMENTER_TIMEOUT	1000000
712
713void
714lock_write(
715 register lock_t * l)
716{
717 lck_rw_lock_exclusive(l);
718}
719
720void
721lock_done(
722 register lock_t * l)
723{
724 (void) lck_rw_done(l);
725}
726
727void
728lock_read(
729 register lock_t * l)
730{
731 lck_rw_lock_shared(l);
732}
733
734
735/*
736 * Routine: lock_read_to_write
737 * Function:
738 * Improves a read-only lock to one with
739 * write permission. If another reader has
740 * already requested an upgrade to a write lock,
741 * no lock is held upon return.
742 *
743 * Returns FALSE if the upgrade *failed*.
744 */
745
746boolean_t
747lock_read_to_write(
748 register lock_t * l)
749{
750 return lck_rw_lock_shared_to_exclusive(l);
751}
752
753void
754lock_write_to_read(
755 register lock_t * l)
756{
757 lck_rw_lock_exclusive_to_shared(l);
758}
759
760
761
762/*
763 * Routine: lck_rw_alloc_init
764 */
765lck_rw_t *
766lck_rw_alloc_init(
767 lck_grp_t *grp,
768 lck_attr_t *attr) {
769 lck_rw_t *lck;
770
771 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0)
772 lck_rw_init(lck, grp, attr);
773
774 return(lck);
775}
776
777/*
778 * Routine: lck_rw_free
779 */
780void
781lck_rw_free(
782 lck_rw_t *lck,
783 lck_grp_t *grp) {
784 lck_rw_destroy(lck, grp);
785 kfree(lck, sizeof(lck_rw_t));
786}
787
788/*
789 * Routine: lck_rw_init
790 */
791void
792lck_rw_init(
793 lck_rw_t *lck,
794 lck_grp_t *grp,
795 lck_attr_t *attr)
796{
797 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
798 attr : &LockDefaultLckAttr;
799
800 hw_lock_byte_init(&lck->lck_rw_interlock);
801 lck->lck_rw_want_write = FALSE;
802 lck->lck_rw_want_upgrade = FALSE;
803 lck->lck_rw_shared_count = 0;
804 lck->lck_rw_can_sleep = TRUE;
805 lck->lck_rw_tag = 0;
806 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
807 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
808
809 lck_grp_reference(grp);
810 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
811}
812
813/*
814 * Routine: lck_rw_destroy
815 */
816void
817lck_rw_destroy(
818 lck_rw_t *lck,
819 lck_grp_t *grp) {
820 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
821 return;
822 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
823 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
824 lck_grp_deallocate(grp);
825 return;
826}
827
/*
 *	Sleep locks.  These use the same data structure and algorithm
 *	as the spin locks, but the process sleeps while it is waiting
 *	for the lock.  These work on uniprocessor systems.
 */

/* Starting value for the MACH_LDEBUG spin-round countdown. */
#define	DECREMENTER_TIMEOUT	1000000

/*
 *	Wait events for blocked readers and writers.  Each is the
 *	address of a different field inside the lock, so the two
 *	classes of waiter can be woken independently.
 */
#define	RW_LOCK_READER_EVENT(x)		\
		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))

#define	RW_LOCK_WRITER_EVENT(x)		\
		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
841
842/*
843 * We need to disable interrupts while holding the mutex interlock
844 * to prevent an IPI intervening.
845 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
846 */
847static boolean_t
848lck_interlock_lock(lck_rw_t *lck)
849{
850 boolean_t istate;
851
852 istate = ml_set_interrupts_enabled(FALSE);
853 hw_lock_byte_lock(&lck->lck_rw_interlock);
854
855 return istate;
856}
857
858static void
859lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
860{
861 hw_lock_byte_unlock(&lck->lck_rw_interlock);
862 ml_set_interrupts_enabled(istate);
863}
864
865/*
866 * This inline is used when busy-waiting for an rw lock.
867 * If interrupts were disabled when the lock primitive was called,
868 * we poll the IPI handler for pending tlb flushes.
869 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
870 */
871static inline void
872lck_rw_lock_pause(boolean_t interrupts_enabled)
873{
874 if (!interrupts_enabled)
875 handle_pending_TLB_flushes();
876 cpu_pause();
877}
878
879/*
880 * Routine: lck_rw_lock_exclusive
881 */
882void
883lck_rw_lock_exclusive(
884 lck_rw_t *lck)
885{
886 int i;
887 wait_result_t res;
888#if MACH_LDEBUG
889 int decrementer;
890#endif /* MACH_LDEBUG */
891 boolean_t istate;
892#if CONFIG_DTRACE
893 uint64_t wait_interval = 0;
894 int slept = 0;
895 int readers_at_sleep;
896#endif
897
898 istate = lck_interlock_lock(lck);
899#if CONFIG_DTRACE
900 readers_at_sleep = lck->lck_rw_shared_count;
901#endif
902
903#if MACH_LDEBUG
904 decrementer = DECREMENTER_TIMEOUT;
905#endif /* MACH_LDEBUG */
906
907 /*
908 * Try to acquire the lck_rw_want_write bit.
909 */
910 while (lck->lck_rw_want_write) {
911
912 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
913 /*
914 * Either sleeping or spinning is happening, start
915 * a timing of our delay interval now.
916 */
917#if CONFIG_DTRACE
918 if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) {
919 wait_interval = mach_absolute_time();
920 } else {
921 wait_interval = -1;
922 }
923#endif
924
925
926 i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0];
927 if (i != 0) {
928 lck_interlock_unlock(lck, istate);
929#if MACH_LDEBUG
930 if (!--decrementer)
931 Debugger("timeout - lck_rw_want_write");
932#endif /* MACH_LDEBUG */
933 while (--i != 0 && lck->lck_rw_want_write)
934 lck_rw_lock_pause(istate);
935 istate = lck_interlock_lock(lck);
936 }
937
938 if (lck->lck_rw_can_sleep && lck->lck_rw_want_write) {
939 lck->lck_w_waiting = TRUE;
940 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
941 if (res == THREAD_WAITING) {
942 lck_interlock_unlock(lck, istate);
943 res = thread_block(THREAD_CONTINUE_NULL);
944#if CONFIG_DTRACE
945 slept = 1;
946#endif
947 istate = lck_interlock_lock(lck);
948 }
949 }
950 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0);
951 }
952 lck->lck_rw_want_write = TRUE;
953
954 /* Wait for readers (and upgrades) to finish */
955
956#if MACH_LDEBUG
957 decrementer = DECREMENTER_TIMEOUT;
958#endif /* MACH_LDEBUG */
959 while ((lck->lck_rw_shared_count != 0) || lck->lck_rw_want_upgrade) {
960
961 i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0];
962
963 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START,
964 (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, i, 0);
965
966#if CONFIG_DTRACE
967 /*
968 * Either sleeping or spinning is happening, start
969 * a timing of our delay interval now. If we set it
970 * to -1 we don't have accurate data so we cannot later
971 * decide to record a dtrace spin or sleep event.
972 */
973 if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) {
974 wait_interval = mach_absolute_time();
975 } else {
976 wait_interval = (unsigned) -1;
977 }
978#endif
979
980 if (i != 0) {
981 lck_interlock_unlock(lck, istate);
982#if MACH_LDEBUG
983 if (!--decrementer)
984 Debugger("timeout - wait for readers");
985#endif /* MACH_LDEBUG */
986 while (--i != 0 && (lck->lck_rw_shared_count != 0 ||
987 lck->lck_rw_want_upgrade))
988 lck_rw_lock_pause(istate);
989 istate = lck_interlock_lock(lck);
990 }
991
992 if (lck->lck_rw_can_sleep && (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade)) {
993 lck->lck_w_waiting = TRUE;
994 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
995 if (res == THREAD_WAITING) {
996 lck_interlock_unlock(lck, istate);
997 res = thread_block(THREAD_CONTINUE_NULL);
998#if CONFIG_DTRACE
999 slept = 1;
1000#endif
1001 istate = lck_interlock_lock(lck);
1002 }
1003 }
1004 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END,
1005 (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, res, 0);
1006 }
1007
1008 lck_interlock_unlock(lck, istate);
1009#if CONFIG_DTRACE
1010 /*
1011 * Decide what latencies we suffered that are Dtrace events.
1012 * If we have set wait_interval, then we either spun or slept.
1013 * At least we get out from under the interlock before we record
1014 * which is the best we can do here to minimize the impact
1015 * of the tracing.
1016 * If we have set wait_interval to -1, then dtrace was not enabled when we
1017 * started sleeping/spinning so we don't record this event.
1018 */
1019 if (wait_interval != 0 && wait_interval != (unsigned) -1) {
1020 if (slept == 0) {
1021 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1022 mach_absolute_time() - wait_interval, 1);
1023 } else {
1024 /*
1025 * For the blocking case, we also record if when we blocked
1026 * it was held for read or write, and how many readers.
1027 * Notice that above we recorded this before we dropped
1028 * the interlock so the count is accurate.
1029 */
1030 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1031 mach_absolute_time() - wait_interval, 1,
1032 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1033 }
1034 }
1035 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1036#endif
1037}
1038
1039
1040/*
1041 * Routine: lck_rw_done_gen
1042 */
1043lck_rw_type_t
1044lck_rw_done_gen(
1045 lck_rw_t *lck)
1046{
1047 boolean_t wakeup_readers = FALSE;
1048 boolean_t wakeup_writers = FALSE;
1049 lck_rw_type_t lck_rw_type;
1050 boolean_t istate;
1051
1052 istate = lck_interlock_lock(lck);
1053
1054 if (lck->lck_rw_shared_count != 0) {
1055 lck_rw_type = LCK_RW_TYPE_SHARED;
1056 lck->lck_rw_shared_count--;
1057 }
1058 else {
1059 lck_rw_type = LCK_RW_TYPE_EXCLUSIVE;
1060 if (lck->lck_rw_want_upgrade)
1061 lck->lck_rw_want_upgrade = FALSE;
1062 else
1063 lck->lck_rw_want_write = FALSE;
1064 }
1065
1066 /*
1067 * There is no reason to wakeup a waiting thread
1068 * if the read-count is non-zero. Consider:
1069 * we must be dropping a read lock
1070 * threads are waiting only if one wants a write lock
1071 * if there are still readers, they can't proceed
1072 */
1073
1074 if (lck->lck_rw_shared_count == 0) {
1075 if (lck->lck_w_waiting) {
1076 lck->lck_w_waiting = FALSE;
1077 wakeup_writers = TRUE;
1078 }
1079 if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) &&
1080 lck->lck_r_waiting) {
1081 lck->lck_r_waiting = FALSE;
1082 wakeup_readers = TRUE;
1083 }
1084 }
1085
1086 lck_interlock_unlock(lck, istate);
1087
1088 if (wakeup_readers)
1089 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1090 if (wakeup_writers)
1091 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1092
1093#if CONFIG_DTRACE
1094 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE ? 1 : 0));
1095#endif
1096
1097 return(lck_rw_type);
1098}
1099
1100
1101
1102
1103/*
1104 * Routine: lck_rw_unlock
1105 */
1106void
1107lck_rw_unlock(
1108 lck_rw_t *lck,
1109 lck_rw_type_t lck_rw_type)
1110{
1111 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1112 lck_rw_unlock_shared(lck);
1113 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1114 lck_rw_unlock_exclusive(lck);
1115 else
1116 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1117}
1118
1119
1120/*
1121 * Routine: lck_rw_unlock_shared
1122 */
1123void
1124lck_rw_unlock_shared(
1125 lck_rw_t *lck)
1126{
1127 lck_rw_type_t ret;
1128
1129 ret = lck_rw_done(lck);
1130
1131 if (ret != LCK_RW_TYPE_SHARED)
1132 panic("lck_rw_unlock(): lock held in mode: %d\n", ret);
1133}
1134
1135
1136/*
1137 * Routine: lck_rw_unlock_exclusive
1138 */
1139void
1140lck_rw_unlock_exclusive(
1141 lck_rw_t *lck)
1142{
1143 lck_rw_type_t ret;
1144
1145 ret = lck_rw_done(lck);
1146
1147 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1148 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1149}
1150
1151
1152/*
1153 * Routine: lck_rw_lock
1154 */
1155void
1156lck_rw_lock(
1157 lck_rw_t *lck,
1158 lck_rw_type_t lck_rw_type)
1159{
1160 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1161 lck_rw_lock_shared(lck);
1162 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1163 lck_rw_lock_exclusive(lck);
1164 else
1165 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1166}
1167
1168
1169/*
1170 * Routine: lck_rw_lock_shared_gen
1171 */
1172void
1173lck_rw_lock_shared_gen(
1174 lck_rw_t *lck)
1175{
1176 int i;
1177 wait_result_t res;
1178#if MACH_LDEBUG
1179 int decrementer;
1180#endif /* MACH_LDEBUG */
1181 boolean_t istate;
1182#if CONFIG_DTRACE
1183 uint64_t wait_interval = 0;
1184 int slept = 0;
1185 int readers_at_sleep;
1186#endif
1187
1188 istate = lck_interlock_lock(lck);
1189#if CONFIG_DTRACE
1190 readers_at_sleep = lck->lck_rw_shared_count;
1191#endif
1192
1193#if MACH_LDEBUG
1194 decrementer = DECREMENTER_TIMEOUT;
1195#endif /* MACH_LDEBUG */
1196 while ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1197 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1198
1199 i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0];
1200
1201 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START,
1202 (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, i, 0);
1203#if CONFIG_DTRACE
1204 if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK]) && wait_interval == 0) {
1205 wait_interval = mach_absolute_time();
1206 } else {
1207 wait_interval = -1;
1208 }
1209#endif
1210
1211 if (i != 0) {
1212 lck_interlock_unlock(lck, istate);
1213#if MACH_LDEBUG
1214 if (!--decrementer)
1215 Debugger("timeout - wait no writers");
1216#endif /* MACH_LDEBUG */
1217 while (--i != 0 &&
1218 (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1219 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl))
1220 lck_rw_lock_pause(istate);
1221 istate = lck_interlock_lock(lck);
1222 }
1223
1224 if (lck->lck_rw_can_sleep &&
1225 (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1226 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1227 lck->lck_r_waiting = TRUE;
1228 res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1229 if (res == THREAD_WAITING) {
1230 lck_interlock_unlock(lck, istate);
1231 res = thread_block(THREAD_CONTINUE_NULL);
1232#if CONFIG_DTRACE
1233 slept = 1;
1234#endif
1235 istate = lck_interlock_lock(lck);
1236 }
1237 }
1238 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END,
1239 (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, res, 0);
1240 }
1241
1242 lck->lck_rw_shared_count++;
1243
1244 lck_interlock_unlock(lck, istate);
1245#if CONFIG_DTRACE
1246 if (wait_interval != 0 && wait_interval != (unsigned) -1) {
1247 if (slept == 0) {
1248 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1249 } else {
1250 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1251 mach_absolute_time() - wait_interval, 0,
1252 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1253 }
1254 }
1255 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1256#endif
1257}
1258
1259
1260/*
1261 * Routine: lck_rw_lock_shared_to_exclusive
1262 * Function:
1263 * Improves a read-only lock to one with
1264 * write permission. If another reader has
1265 * already requested an upgrade to a write lock,
1266 * no lock is held upon return.
1267 *
1268 * Returns FALSE if the upgrade *failed*.
1269 */
1270
1271boolean_t
1272lck_rw_lock_shared_to_exclusive(
1273 lck_rw_t *lck)
1274{
1275 int i;
1276 boolean_t do_wakeup = FALSE;
1277 wait_result_t res;
1278#if MACH_LDEBUG
1279 int decrementer;
1280#endif /* MACH_LDEBUG */
1281 boolean_t istate;
1282#if CONFIG_DTRACE
1283 uint64_t wait_interval = 0;
1284 int slept = 0;
1285 int readers_at_sleep = 0;
1286#endif
1287
1288 istate = lck_interlock_lock(lck);
1289
1290 lck->lck_rw_shared_count--;
1291
1292 if (lck->lck_rw_want_upgrade) {
1293 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START,
1294 (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1295
1296 /*
1297 * Someone else has requested upgrade.
1298 * Since we've released a read lock, wake
1299 * him up.
1300 */
1301 if (lck->lck_w_waiting && (lck->lck_rw_shared_count == 0)) {
1302 lck->lck_w_waiting = FALSE;
1303 do_wakeup = TRUE;
1304 }
1305
1306 lck_interlock_unlock(lck, istate);
1307
1308 if (do_wakeup)
1309 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1310
1311 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END,
1312 (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1313
1314 return (FALSE);
1315 }
1316
1317 lck->lck_rw_want_upgrade = TRUE;
1318
1319#if MACH_LDEBUG
1320 decrementer = DECREMENTER_TIMEOUT;
1321#endif /* MACH_LDEBUG */
1322 while (lck->lck_rw_shared_count != 0) {
1323#if CONFIG_DTRACE
1324 if (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] && wait_interval == 0) {
1325 wait_interval = mach_absolute_time();
1326 readers_at_sleep = lck->lck_rw_shared_count;
1327 } else {
1328 wait_interval = -1;
1329 }
1330#endif
1331 i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0];
1332
1333 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START,
1334 (int)lck, lck->lck_rw_shared_count, i, 0, 0);
1335
1336 if (i != 0) {
1337 lck_interlock_unlock(lck, istate);
1338#if MACH_LDEBUG
1339 if (!--decrementer)
1340 Debugger("timeout - lck_rw_shared_count");
1341#endif /* MACH_LDEBUG */
1342 while (--i != 0 && lck->lck_rw_shared_count != 0)
1343 lck_rw_lock_pause(istate);
1344 istate = lck_interlock_lock(lck);
1345 }
1346
1347 if (lck->lck_rw_can_sleep && lck->lck_rw_shared_count != 0) {
1348 lck->lck_w_waiting = TRUE;
1349 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1350 if (res == THREAD_WAITING) {
1351 lck_interlock_unlock(lck, istate);
1352 res = thread_block(THREAD_CONTINUE_NULL);
1353#if CONFIG_DTRACE
1354 slept = 1;
1355#endif
1356 istate = lck_interlock_lock(lck);
1357 }
1358 }
1359 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END,
1360 (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1361 }
1362
1363 lck_interlock_unlock(lck, istate);
1364#if CONFIG_DTRACE
1365 /*
1366 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1367 */
1368 if (wait_interval != 0 && wait_interval != (unsigned) -1 && readers_at_sleep) {
1369 if (slept == 0) {
1370 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1371 } else {
1372 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1373 mach_absolute_time() - wait_interval, 1,
1374 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1375 }
1376 }
1377
1378 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1379#endif
1380 return (TRUE);
1381}
1382
1383/*
1384 * Routine: lck_rw_lock_exclusive_to_shared
1385 */
1386void
1387lck_rw_lock_exclusive_to_shared(
1388 lck_rw_t *lck)
1389{
1390 boolean_t wakeup_readers = FALSE;
1391 boolean_t wakeup_writers = FALSE;
1392 boolean_t istate;
1393
1394 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1395 (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1396
1397 istate = lck_interlock_lock(lck);
1398
1399 lck->lck_rw_shared_count++;
1400 if (lck->lck_rw_want_upgrade)
1401 lck->lck_rw_want_upgrade = FALSE;
1402 else
1403 lck->lck_rw_want_write = FALSE;
1404
1405 if (lck->lck_w_waiting) {
1406 lck->lck_w_waiting = FALSE;
1407 wakeup_writers = TRUE;
1408 }
1409 if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) &&
1410 lck->lck_r_waiting) {
1411 lck->lck_r_waiting = FALSE;
1412 wakeup_readers = TRUE;
1413 }
1414
1415 lck_interlock_unlock(lck, istate);
1416
1417 if (wakeup_readers)
1418 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1419 if (wakeup_writers)
1420 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1421
1422 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1423 (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1424
1425#if CONFIG_DTRACE
1426 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1427#endif
1428}
1429
1430
1431/*
1432 * Routine: lck_rw_try_lock
1433 */
1434boolean_t
1435lck_rw_try_lock(
1436 lck_rw_t *lck,
1437 lck_rw_type_t lck_rw_type)
1438{
1439 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1440 return(lck_rw_try_lock_shared(lck));
1441 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1442 return(lck_rw_try_lock_exclusive(lck));
1443 else
1444 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1445 return(FALSE);
1446}
1447
1448/*
1449 * Routine: lck_rw_try_lock_exclusive
1450 * Function:
1451 * Tries to get a write lock.
1452 *
1453 * Returns FALSE if the lock is not held on return.
1454 */
1455
1456boolean_t
1457lck_rw_try_lock_exclusive(
1458 lck_rw_t *lck)
1459{
1460 boolean_t istate;
1461
1462 istate = lck_interlock_lock(lck);
1463
1464 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || lck->lck_rw_shared_count) {
1465 /*
1466 * Can't get lock.
1467 */
1468 lck_interlock_unlock(lck, istate);
1469 return(FALSE);
1470 }
1471
1472 /*
1473 * Have lock.
1474 */
1475
1476 lck->lck_rw_want_write = TRUE;
1477
1478 lck_interlock_unlock(lck, istate);
1479
1480#if CONFIG_DTRACE
1481 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lck, 1);
1482#endif
1483 return(TRUE);
1484}
1485
1486/*
1487 * Routine: lck_rw_try_lock_shared
1488 * Function:
1489 * Tries to get a read lock.
1490 *
1491 * Returns FALSE if the lock is not held on return.
1492 */
1493
1494boolean_t
1495lck_rw_try_lock_shared(
1496 lck_rw_t *lck)
1497{
1498 boolean_t istate;
1499
1500 istate = lck_interlock_lock(lck);
1501/* No reader priority check here... */
1502 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) {
1503 lck_interlock_unlock(lck, istate);
1504 return(FALSE);
1505 }
1506
1507 lck->lck_rw_shared_count++;
1508
1509 lck_interlock_unlock(lck, istate);
1510
1511#if CONFIG_DTRACE
1512 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lck, 0);
1513#endif
1514 return(TRUE);
1515}
1516
1517void
1518lck_rw_assert(
1519 lck_rw_t *lck,
1520 unsigned int type)
1521{
1522 switch (type) {
1523 case LCK_RW_ASSERT_SHARED:
1524 if (lck->lck_rw_shared_count != 0) {
1525 return;
1526 }
1527 break;
1528 case LCK_RW_ASSERT_EXCLUSIVE:
1529 if ((lck->lck_rw_want_write ||
1530 lck->lck_rw_want_upgrade) &&
1531 lck->lck_rw_shared_count == 0) {
1532 return;
1533 }
1534 break;
1535 case LCK_RW_ASSERT_HELD:
1536 if (lck->lck_rw_want_write ||
1537 lck->lck_rw_want_upgrade ||
1538 lck->lck_rw_shared_count != 0) {
1539 return;
1540 }
1541 break;
1542 default:
1543 break;
1544 }
1545
1546 panic("rw lock (%p) not held (mode=%u)\n", lck, type);
1547}
1548
1549/*
1550 * Routine: lck_mtx_alloc_init
1551 */
1552lck_mtx_t *
1553lck_mtx_alloc_init(
1554 lck_grp_t *grp,
1555 lck_attr_t *attr)
1556{
1557 lck_mtx_t *lck;
1558
1559 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1560 lck_mtx_init(lck, grp, attr);
1561
1562 return(lck);
1563}
1564
1565/*
1566 * Routine: lck_mtx_free
1567 */
1568void
1569lck_mtx_free(
1570 lck_mtx_t *lck,
1571 lck_grp_t *grp)
1572{
1573 lck_mtx_destroy(lck, grp);
1574 kfree(lck, sizeof(lck_mtx_t));
1575}
1576
1577/*
1578 * Routine: lck_mtx_ext_init
1579 */
1580static void
1581lck_mtx_ext_init(
1582 lck_mtx_ext_t *lck,
1583 lck_grp_t *grp,
1584 lck_attr_t *attr)
1585{
1586 bzero((void *)lck, sizeof(lck_mtx_ext_t));
1587
1588 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1589 lck->lck_mtx_deb.type = MUTEX_TAG;
1590 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1591 }
1592
1593 lck->lck_mtx_grp = grp;
1594
1595 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1596 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1597}
1598
1599/*
1600 * Routine: lck_mtx_init
1601 */
1602void
1603lck_mtx_init(
1604 lck_mtx_t *lck,
1605 lck_grp_t *grp,
1606 lck_attr_t *attr)
1607{
1608 lck_mtx_ext_t *lck_ext;
1609 lck_attr_t *lck_attr;
1610
1611 if (attr != LCK_ATTR_NULL)
1612 lck_attr = attr;
1613 else
1614 lck_attr = &LockDefaultLckAttr;
1615
1616 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1617 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1618 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1619 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1620 lck->lck_mtx_ptr = lck_ext;
1621 }
1622 } else {
1623 lck->lck_mtx_ilk = 0;
1624 lck->lck_mtx_locked = 0;
1625 lck->lck_mtx_waiters = 0;
1626 lck->lck_mtx_pri = 0;
1627 }
1628 lck_grp_reference(grp);
1629 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1630}
1631
1632/*
1633 * Routine: lck_mtx_init_ext
1634 */
1635void
1636lck_mtx_init_ext(
1637 lck_mtx_t *lck,
1638 lck_mtx_ext_t *lck_ext,
1639 lck_grp_t *grp,
1640 lck_attr_t *attr)
1641{
1642 lck_attr_t *lck_attr;
1643
1644 if (attr != LCK_ATTR_NULL)
1645 lck_attr = attr;
1646 else
1647 lck_attr = &LockDefaultLckAttr;
1648
1649 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1650 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1651 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1652 lck->lck_mtx_ptr = lck_ext;
1653 } else {
1654 lck->lck_mtx_ilk = 0;
1655 lck->lck_mtx_locked = 0;
1656 lck->lck_mtx_waiters = 0;
1657 lck->lck_mtx_pri = 0;
1658 }
1659 lck_grp_reference(grp);
1660 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1661}
1662
1663/*
1664 * Routine: lck_mtx_destroy
1665 */
1666void
1667lck_mtx_destroy(
1668 lck_mtx_t *lck,
1669 lck_grp_t *grp)
1670{
1671 boolean_t lck_is_indirect;
1672
1673 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1674 return;
1675 lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1676 lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
1677 if (lck_is_indirect)
1678 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1679 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1680 lck_grp_deallocate(grp);
1681 return;
1682}
1683
1684/*
1685 * Routine: lck_mtx_lock_spinwait
1686 *
1687 * Invoked trying to acquire a mutex when there is contention but
1688 * the holder is running on another processor. We spin for up to a maximum
1689 * time waiting for the lock to be released.
1690 *
1691 * Called with the interlock unlocked.
1692 */
1693void
1694lck_mtx_lock_spinwait(
1695 lck_mtx_t *lck)
1696{
1697 thread_t holder;
1698 volatile lck_mtx_t *mutex;
1699 uint64_t deadline;
1700
1701 if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
1702 mutex = lck;
1703 else
1704 mutex = &lck->lck_mtx_ptr->lck_mtx;
1705
1706 KERNEL_DEBUG(
1707 MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN) | DBG_FUNC_NONE,
1708 (int)lck, (int)mutex->lck_mtx_locked, 0, 0, 0);
1709
1710 deadline = mach_absolute_time() + MutexSpin;
1711 /*
1712 * Spin while:
1713 * - mutex is locked, and
1714 * - its locked as a spin lock, or
1715 * - owner is running on another processor, and
1716 * - owner (processor) is not idling, and
1717 * - we haven't spun for long enough.
1718 */
1719 while ((holder = (thread_t) mutex->lck_mtx_locked) != NULL) {
1720 if ((holder == (thread_t)MUTEX_LOCKED_AS_SPIN) ||
1721 ((holder->machine.specFlags & OnProc) != 0 &&
1722 (holder->state & TH_IDLE) == 0 &&
1723 mach_absolute_time() < deadline)) {
1724 cpu_pause();
1725 continue;
1726 }
1727 break;
1728 }
1729#if CONFIG_DTRACE
1730 /*
1731 * We've already kept a count via deadline of how long we spun.
1732 * If dtrace is active, then we compute backwards to decide how
1733 * long we spun.
1734 *
1735 * Note that we record a different probe id depending on whether
1736 * this is a direct or indirect mutex. This allows us to
1737 * penalize only lock groups that have debug/stats enabled
1738 * with dtrace processing if desired.
1739 */
1740 if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
1741 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lck,
1742 mach_absolute_time() - (deadline - MutexSpin));
1743 } else {
1744 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lck,
1745 mach_absolute_time() - (deadline - MutexSpin));
1746 }
1747 /* The lockstat acquire event is recorded by the assembly code beneath us. */
1748#endif
1749}
1750
1751/*
1752 * Called from assembly code when a destroyed mutex is detected
1753 * during a lock/unlock/try/convert
1754 */
1755
1756void
1757lck_mtx_interlock_panic(
1758 lck_mtx_t *lck)
1759{
1760 panic("trying to interlock destroyed mutex %p", lck);
1761}
1762
1763
1764#if MACH_KDB
1765
1766void
1767db_show_one_lock(
1768 lock_t *lock)
1769{
1770 db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ",
1771 lock->lck_rw_shared_count,
1772 lock->lck_rw_want_upgrade ? "" : "!",
1773 lock->lck_rw_want_write ? "" : "!");
1774 db_printf("%swaiting, %scan_sleep\n",
1775 (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!",
1776 lock->lck_rw_can_sleep ? "" : "!");
1777 db_printf("Interlock:\n");
1778 db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)),
1779 TRUE, (db_expr_t)0, (char *)0);
1780}
1781
1782#endif /* MACH_KDB */
1783
1784/*
1785 * The C portion of the mutex package. These routines are only invoked
1786 * if the optimized assembler routines can't do the work.
1787 */
1788
1789/*
1790 * Routine: lock_alloc
1791 * Function:
1792 * Allocate a mutex for external users who cannot
1793 * hard-code the structure definition into their
1794 * objects.
1795 * For now just use kalloc, but a zone is probably
1796 * warranted.
1797 */
1798mutex_t *
1799mutex_alloc(
1800 unsigned short tag)
1801{
1802 mutex_t *m;
1803
1804 if ((m = (mutex_t *)kalloc(sizeof(mutex_t))) != 0)
1805 mutex_init(m, tag);
1806 return(m);
1807}
1808
1809/*
1810 * Routine: mutex_free
1811 * Function:
1812 * Free a mutex allocated for external users.
1813 * For now just use kfree, but a zone is probably
1814 * warranted.
1815 */
1816void
1817mutex_free(
1818 mutex_t *m)
1819{
1820 kfree(m, sizeof(mutex_t));
1821}
1822
1823
1824#if MACH_KDB
1825/*
1826 * Routines to print out simple_locks and mutexes in a nicely-formatted
1827 * fashion.
1828 */
1829
/* Heading line printed by db_show_one_simple_lock() above the lock's fields. */
const char *simple_lock_labels =
	"ENTRY ILK THREAD DURATION CALLER";

/* Heading line printed by db_show_one_mutex() above the mutex's fields. */
const char *mutex_labels =
	"ENTRY LOCKED WAITERS THREAD CALLER";
1832
1833void
1834db_show_one_simple_lock (
1835 db_expr_t addr,
1836 boolean_t have_addr,
1837 __unused db_expr_t count,
1838 __unused char * modif)
1839{
1840 simple_lock_t saddr = (simple_lock_t) ((vm_offset_t) addr);
1841
1842 if (saddr == (simple_lock_t)0 || !have_addr) {
1843 db_error ("No simple_lock\n");
1844 }
1845#if USLOCK_DEBUG
1846 else if (saddr->lock_type != USLOCK_TAG)
1847 db_error ("Not a simple_lock\n");
1848#endif /* USLOCK_DEBUG */
1849
1850 db_printf ("%s\n", simple_lock_labels);
1851 db_print_simple_lock (saddr);
1852}
1853
1854void
1855db_print_simple_lock (
1856 simple_lock_t addr)
1857{
1858
1859 db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock));
1860#if USLOCK_DEBUG
1861 db_printf (" %08x", addr->debug.lock_thread);
1862 db_printf (" %08x ", addr->debug.duration[1]);
1863 db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY);
1864#endif /* USLOCK_DEBUG */
1865 db_printf ("\n");
1866}
1867
1868void
1869db_show_one_mutex (
1870 db_expr_t addr,
1871 boolean_t have_addr,
1872 __unused db_expr_t count,
1873 __unused char * modif)
1874{
1875 mutex_t * maddr = (mutex_t *)((vm_offset_t) addr);
1876
1877 if (maddr == (mutex_t *)0 || !have_addr)
1878 db_error ("No mutex\n");
1879#if MACH_LDEBUG
1880 else if (maddr->type != MUTEX_TAG)
1881 db_error ("Not a mutex\n");
1882#endif /* MACH_LDEBUG */
1883
1884 db_printf ("%s\n", mutex_labels);
1885 db_print_mutex (maddr);
1886}
1887
1888void
1889db_print_mutex (
1890 mutex_t * addr)
1891{
1892 db_printf ("%08x %6d %7d",
1893 addr, *addr, addr->lck_mtx.lck_mtx_waiters);
1894#if MACH_LDEBUG
1895 db_printf (" %08x ", addr->thread);
1896 db_printsym (addr->pc, DB_STGY_ANY);
1897#endif /* MACH_LDEBUG */
1898 db_printf ("\n");
1899}
1900
1901#endif /* MACH_KDB */