[apple/xnu.git] bsd/kern/kern_lockf.c (xnu-6153.141.1)
1 /*
2 * Copyright (c) 2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * This code is derived from software contributed to Berkeley by
33 * Scooter Morris at Genentech Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
60 */
61
62 #include <sys/cdefs.h>
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/lock.h>
67 #include <sys/mount.h>
68 #include <sys/proc.h>
69 #include <sys/signalvar.h>
70 #include <sys/unistd.h>
71 #include <sys/user.h>
72 #include <sys/vnode.h>
73 #include <sys/vnode_internal.h>
74 #include <sys/vnode_if.h>
75 #include <sys/malloc.h>
76 #include <sys/fcntl.h>
77 #include <sys/lockf.h>
78 #include <sys/sdt.h>
79 #include <kern/policy_internal.h>
80
81 #include <sys/file_internal.h>
82
83 #if (DEVELOPMENT || DEBUG)
84 #define LOCKF_DEBUGGING 1
85 #endif
86
87 #ifdef LOCKF_DEBUGGING
88 #include <sys/sysctl.h>
89 void lf_print(const char *tag, struct lockf *lock);
90 void lf_printlist(const char *tag, struct lockf *lock);
91
92 #define LF_DBG_LOCKOP (1 << 0) /* setlk, getlk, clearlk */
93 #define LF_DBG_LIST (1 << 1) /* split, coalesce */
94 #define LF_DBG_IMPINH (1 << 2) /* importance inheritance */
95 #define LF_DBG_TRACE (1 << 3) /* errors, exit */
96 #define LF_DBG_DEADLOCK (1 << 4) /* deadlock detection */
97
98 static int lockf_debug = 0; /* was 2, could be 3 ;-) */
99 SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");
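/*
 * Example: on a DEVELOPMENT or DEBUG kernel the debug mask can be changed
 * at run time, e.g. "sysctl debug.lockf_debug=0x3" enables the
 * LF_DBG_LOCKOP and LF_DBG_LIST diagnostics defined above.
 */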
100
101 /*
102 * If no mask bit is specified, or a mask is specified and the matching
103 * bit is set in lockf_debug, then output the debugging diagnostic.
104 */
105 #define LOCKF_DEBUG(mask, ...) \
106 do { \
107 if (!(mask) || ((mask) & lockf_debug)) { \
108 printf("%s>", __FUNCTION__); \
109 printf(__VA_ARGS__); \
110 } \
111 } while(0)
112
113 #define LOCKF_DEBUGP(mask) \
114 ({ \
115 ((mask) & lockf_debug); \
116 })
117 #else /* !LOCKF_DEBUGGING */
118 #define LOCKF_DEBUG(mask, ...) /* mask */
119 #endif /* !LOCKF_DEBUGGING */
120
121 MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
122
123 #define NOLOCKF (struct lockf *)0
124 #define SELF 0x1
125 #define OTHERS 0x2
126 #define OFF_MAX 0x7fffffffffffffffULL /* max off_t */
127
128 /*
129 * Overlapping lock states
130 */
131 typedef enum {
132 OVERLAP_NONE = 0,
133 OVERLAP_EQUALS_LOCK,
134 OVERLAP_CONTAINS_LOCK,
135 OVERLAP_CONTAINED_BY_LOCK,
136 OVERLAP_STARTS_BEFORE_LOCK,
137 OVERLAP_ENDS_AFTER_LOCK
138 } overlap_t;
139
140 static int lf_clearlock(struct lockf *);
141 static overlap_t lf_findoverlap(struct lockf *,
142 struct lockf *, int, struct lockf ***, struct lockf **);
143 static struct lockf *lf_getblock(struct lockf *, pid_t);
144 static int lf_getlock(struct lockf *, struct flock *, pid_t);
145 static int lf_setlock(struct lockf *, struct timespec *);
146 static int lf_split(struct lockf *, struct lockf *);
147 static void lf_wakelock(struct lockf *, boolean_t);
148 #if IMPORTANCE_INHERITANCE
149 static void lf_hold_assertion(task_t, struct lockf *);
150 static void lf_jump_to_queue_head(struct lockf *, struct lockf *);
151 static void lf_drop_assertion(struct lockf *);
152 static void lf_boost_blocking_proc(struct lockf *, struct lockf *);
153 static void lf_adjust_assertion(struct lockf *block);
154 #endif /* IMPORTANCE_INHERITANCE */
155
156 static lck_mtx_t lf_dead_lock;
157 static lck_grp_t *lf_dead_lock_grp;
158
159 void
160 lf_init(void)
161 {
162 lf_dead_lock_grp = lck_grp_alloc_init("lf_dead_lock", LCK_GRP_ATTR_NULL);
163 lck_mtx_init(&lf_dead_lock, lf_dead_lock_grp, LCK_ATTR_NULL);
164 }
165
166 /*
167 * lf_advlock
168 *
169 * Description: Advisory record locking support
170 *
171 * Parameters: ap Argument pointer to a vnop_advlock_args
172 * argument descriptor structure for the
173 * lock operation to be attempted.
174 *
175 * Returns: 0 Success
176 * EOVERFLOW
177 * EINVAL
178 * ENOLCK Number of locked regions exceeds limit
179 * lf_setlock:EAGAIN
180 * lf_setlock:EDEADLK
181 * lf_setlock:EINTR
182 * lf_setlock:ENOLCK
183 * lf_setlock:ETIMEDOUT
184 * lf_clearlock:ENOLCK
185 * vnode_size:???
186 *
187 * Notes: We return ENOLCK when we run out of memory to support locks; as
188 * such, there is no specific limit on the number of locks other than
189 * the amount of available resources.
190 */
191 int
192 lf_advlock(struct vnop_advlock_args *ap)
193 {
194 struct vnode *vp = ap->a_vp;
195 struct flock *fl = ap->a_fl;
196 vfs_context_t context = ap->a_context;
197 struct lockf *lock;
198 off_t start, end, oadd;
199 u_quad_t size;
200 int error;
201 struct lockf **head = &vp->v_lockf;
202
203 /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */
204
205 /*
206 * Avoid the common case of unlocking when inode has no locks.
207 */
208 if (*head == (struct lockf *)0) {
209 if (ap->a_op != F_SETLK) {
210 fl->l_type = F_UNLCK;
211 LOCKF_DEBUG(LF_DBG_TRACE,
212 "lf_advlock: '%s' unlock without lock\n",
213 vfs_context_proc(context)->p_comm);
214 return 0;
215 }
216 }
217
218 /*
219 * Convert the flock structure into a start and end.
220 */
221 switch (fl->l_whence) {
222 case SEEK_SET:
223 case SEEK_CUR:
224 /*
225 * Caller is responsible for adding any necessary offset
226 * when SEEK_CUR is used.
227 */
228 start = fl->l_start;
229 break;
230
231 case SEEK_END:
232
233 /*
234 * It's OK to cast the u_quad_t to an off_t here, since they
235 * are the same storage size, and the value of the returned
236 * contents will never overflow into the sign bit. We need to
237 * do this because we will use size to force range checks.
238 */
239 if ((error = vnode_size(vp, (off_t *)&size, context))) {
240 LOCKF_DEBUG(LF_DBG_TRACE,
241 "lf_advlock: vnode_getattr failed: %d\n", error);
242 return error;
243 }
244
245 if (size > OFF_MAX ||
246 (fl->l_start > 0 &&
247 size > (u_quad_t)(OFF_MAX - fl->l_start))) {
248 return EOVERFLOW;
249 }
250 start = size + fl->l_start;
251 break;
252
253 default:
254 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
255 fl->l_whence);
256 return EINVAL;
257 }
258 if (start < 0) {
259 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
260 start);
261 return EINVAL;
262 }
263 if (fl->l_len < 0) {
264 if (start == 0) {
265 LOCKF_DEBUG(LF_DBG_TRACE,
266 "lf_advlock: len < 0 & start == 0\n");
267 return EINVAL;
268 }
269 end = start - 1;
270 start += fl->l_len;
271 if (start < 0) {
272 LOCKF_DEBUG(LF_DBG_TRACE,
273 "lf_advlock: start < 0 (%qd)\n", start);
274 return EINVAL;
275 }
276 } else if (fl->l_len == 0) {
277 end = -1;
278 } else {
279 oadd = fl->l_len - 1;
280 if (oadd > (off_t)(OFF_MAX - start)) {
281 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
282 return EOVERFLOW;
283 }
284 end = start + oadd;
285 }
286 /*
287 * Create the lockf structure
288 */
289 MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
290 if (lock == NULL) {
291 return ENOLCK;
292 }
293 lock->lf_start = start;
294 lock->lf_end = end;
295 lock->lf_id = ap->a_id;
296 lock->lf_vnode = vp;
297 lock->lf_type = fl->l_type;
298 lock->lf_head = head;
299 lock->lf_next = (struct lockf *)0;
300 TAILQ_INIT(&lock->lf_blkhd);
301 lock->lf_flags = ap->a_flags;
302 #if IMPORTANCE_INHERITANCE
303 lock->lf_boosted = LF_NOT_BOOSTED;
304 #endif
305 if (ap->a_flags & F_POSIX) {
306 lock->lf_owner = (struct proc *)lock->lf_id;
307 } else {
308 lock->lf_owner = NULL;
309 }
310
311 if (ap->a_flags & F_FLOCK) {
312 lock->lf_flags |= F_WAKE1_SAFE;
313 }
314
315 lck_mtx_lock(&vp->v_lock); /* protect the lockf list */
316 /*
317 * Do the requested operation.
318 */
319 switch (ap->a_op) {
320 case F_SETLK:
321 /*
322 * For F_OFD_* locks, lf_id is the fileglob.
323 * Record an "lf_owner" iff this is a confined fd
324 * i.e. it cannot escape this process and will be
325 * F_UNLCKed before the owner exits. (This is
326 * the implicit guarantee needed to ensure lf_owner
327 * remains a valid reference here.)
328 */
329 if (ap->a_flags & F_OFD_LOCK) {
330 struct fileglob *fg = (void *)lock->lf_id;
331 if (fg->fg_lflags & FG_CONFINED) {
332 lock->lf_owner = current_proc();
333 }
334 }
335 error = lf_setlock(lock, ap->a_timeout);
336 break;
337
338 case F_UNLCK:
339 error = lf_clearlock(lock);
340 FREE(lock, M_LOCKF);
341 break;
342
343 case F_GETLK:
344 error = lf_getlock(lock, fl, -1);
345 FREE(lock, M_LOCKF);
346 break;
347
348 case F_GETLKPID:
349 error = lf_getlock(lock, fl, fl->l_pid);
350 FREE(lock, M_LOCKF);
351 break;
352
353 default:
354 FREE(lock, M_LOCKF);
355 error = EINVAL;
356 break;
357 }
358 lck_mtx_unlock(&vp->v_lock); /* done manipulating the list */
359
360 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
361 return error;
362 }
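/*
 * Illustrative userland sketch (example only, never compiled into the
 * kernel): how an fcntl(2) byte-range request maps onto the inclusive
 * <start, end> range computed by lf_advlock() above.  With
 * l_whence = SEEK_SET, l_start = 100 and l_len = 50 the lock covers
 * [100, 149]; l_len = 0 means "to end of file" and is represented
 * internally as lf_end == -1.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int
lock_example(int fd)
{
	struct flock fl = {
		.l_type   = F_WRLCK,    /* exclusive record lock */
		.l_whence = SEEK_SET,   /* l_start measured from start of file */
		.l_start  = 100,        /* lf_start == 100 */
		.l_len    = 50,         /* lf_end == 100 + 50 - 1 == 149 */
	};

	/* F_SETLKW sets F_WAIT, so this sleeps in lf_setlock() if blocked */
	return fcntl(fd, F_SETLKW, &fl);
}
#endif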
363
364 /*
365 * Empty the queue of msleeping requests for a lock on the given vnode.
366 * Called with the vnode already locked. Used for forced unmount, where
367 * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
368 * that prevents the vnode from ever being drained. Force unmounting wins.
369 */
370 void
371 lf_abort_advlocks(vnode_t vp)
372 {
373 struct lockf *lock;
374
375 if ((lock = vp->v_lockf) == NULL) {
376 return;
377 }
378
379 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
380
381 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
382 struct lockf *tlock;
383
384 TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
385 /*
386 * Setting this flag should cause all
387 * currently blocked F_SETLK requests to
388 * return to userland with an errno.
389 */
390 tlock->lf_flags |= F_ABORT;
391 }
392 lf_wakelock(lock, TRUE);
393 }
394 }
395
396 /*
397 * Take any lock attempts which are currently blocked by a given lock ("from")
398 * and mark them as blocked by a different lock ("to"). Used in the case
399 * where a byte range currently occupied by "from" is to be occupied by "to."
400 */
401 static void
402 lf_move_blocked(struct lockf *to, struct lockf *from)
403 {
404 struct lockf *tlock;
405
406 TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
407 tlock->lf_next = to;
408 }
409
410 TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
411 }
412
413 /*
414 * lf_coalesce_adjacent
415 *
416 * Description: Helper function: when setting a lock, coalesce adjacent
417 * locks. Needed because adjacent locks are not overlapping,
418 * but POSIX requires that they be coalesced.
419 *
420 * Parameters: lock The new lock which may be adjacent
421 * to already locked regions, and which
422 * should therefore be coalesced with them
423 *
424 * Returns: <void>
425 */
426 static void
427 lf_coalesce_adjacent(struct lockf *lock)
428 {
429 struct lockf **lf = lock->lf_head;
430
431 while (*lf != NOLOCKF) {
432 /* reject locks that obviously could not be coalesced */
433 if ((*lf == lock) ||
434 ((*lf)->lf_id != lock->lf_id) ||
435 ((*lf)->lf_type != lock->lf_type)) {
436 lf = &(*lf)->lf_next;
437 continue;
438 }
439
440 /*
441 * NOTE: Assumes that if two locks are adjacent on the number line
442 * and belong to the same owner, then they are adjacent on the list.
443 */
444 if ((*lf)->lf_end != -1 &&
445 ((*lf)->lf_end + 1) == lock->lf_start) {
446 struct lockf *adjacent = *lf;
447
448 LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
449 lock->lf_start = (*lf)->lf_start;
450 *lf = lock;
451 lf = &(*lf)->lf_next;
452
453 lf_move_blocked(lock, adjacent);
454
455 FREE(adjacent, M_LOCKF);
456 continue;
457 }
458 /* If the lock starts adjacent to us, we can coalesce it */
459 if (lock->lf_end != -1 &&
460 (lock->lf_end + 1) == (*lf)->lf_start) {
461 struct lockf *adjacent = *lf;
462
463 LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
464 lock->lf_end = (*lf)->lf_end;
465 lock->lf_next = (*lf)->lf_next;
466 lf = &lock->lf_next;
467
468 lf_move_blocked(lock, adjacent);
469
470 FREE(adjacent, M_LOCKF);
471 continue;
472 }
473
474 /* no matching conditions; go on to next lock */
475 lf = &(*lf)->lf_next;
476 }
477 }
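/*
 * Concrete example of the adjacency test above: an existing lock covering
 * [0, 99] and a new lock covering [100, 199], both F_WRLCK and both owned
 * by the same lf_id, satisfy (lf_end + 1) == lf_start and are merged into
 * a single lock covering [0, 199].  Locks of a different type or a
 * different owner are never coalesced.
 */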
478
479 /*
480 * lf_setlock
481 *
482 * Description: Set a byte-range lock.
483 *
484 * Parameters: lock The lock structure describing the lock
485 * to be set; allocated by the caller, it
486 * will be linked into the lock list if
487 * the set is successful, and freed if the
488 * set is unsuccessful.
489 *
490 * timeout Timeout specified in the case of
491 * SETLKWTIMEOUT.
492 *
493 * Returns: 0 Success
494 * EAGAIN
495 * EDEADLK
496 * lf_split:ENOLCK
497 * lf_clearlock:ENOLCK
498 * msleep:EINTR
499 * msleep:ETIMEDOUT
500 *
501 * Notes: We add the lock to the provisional lock list. We do not
502 * coalesce at this time; this has implications for other lock
503 * requestors in the blocker search mechanism.
504 */
505 static int
506 lf_setlock(struct lockf *lock, struct timespec *timeout)
507 {
508 struct lockf *block;
509 struct lockf **head = lock->lf_head;
510 struct lockf **prev, *overlap, *ltmp;
511 static const char lockstr[] = "lockf";
512 int priority, needtolink, error;
513 struct vnode *vp = lock->lf_vnode;
514 overlap_t ovcase;
515
516 #ifdef LOCKF_DEBUGGING
517 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
518 lf_print("lf_setlock", lock);
519 lf_printlist("lf_setlock(in)", lock);
520 }
521 #endif /* LOCKF_DEBUGGING */
522 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n", lock, lock->lf_vnode);
523
524 /*
525 * Set the priority
526 */
527 priority = PLOCK;
528 if (lock->lf_type == F_WRLCK) {
529 priority += 4;
530 }
531 priority |= PCATCH;
532 scan:
533 /*
534 * Scan lock list for this file looking for locks that would block us.
535 */
536 while ((block = lf_getblock(lock, -1))) {
537 /*
538 * Free the structure and return if nonblocking.
539 */
540 if ((lock->lf_flags & F_WAIT) == 0) {
541 DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
542 FREE(lock, M_LOCKF);
543 return EAGAIN;
544 }
545
546 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n", lock, block);
547
548 /*
549 * We are blocked. Since flock style locks cover
550 * the whole file, there is no chance for deadlock.
551 *
552 * OFD byte-range locks currently do NOT support
553 * deadlock detection.
554 *
555 * For POSIX byte-range locks we must check for deadlock.
556 *
557 * Deadlock detection is done by looking through the
558 * wait channels to see if there are any cycles that
559 * involve us.
560 */
561 if ((lock->lf_flags & F_POSIX) &&
562 (block->lf_flags & F_POSIX)) {
563 lck_mtx_lock(&lf_dead_lock);
564
565 /* The blocked process is waiting on something */
566 struct proc *wproc = block->lf_owner;
567 proc_lock(wproc);
568
569 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(wproc));
570
571 struct uthread *ut;
572 TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
573 /*
574 * If the thread is (a) asleep (uu_wchan != 0)
575 * and (b) in this code (uu_wmesg == lockstr)
576 * then check to see if the lock is blocked behind
577 * someone blocked behind us.
578 *
579 * Note: (i) vp->v_lock is held, preventing other
580 * threads from mutating the blocking list for our vnode.
581 * and (ii) the proc_lock is held i.e the thread list
582 * is stable.
583 *
584 * HOWEVER some thread in wproc might be sleeping on a lockf
585 * structure for a different vnode, and be woken at any
586 * time. Thus the waitblock list could mutate while
587 * it's being inspected by this thread, and what
588 * ut->uu_wchan was just pointing at could even be freed.
589 *
590 * Nevertheless this is safe here because of lf_dead_lock; if
591 * any thread blocked with uu_wmesg == lockstr wakes (see below)
592 * it will try to acquire lf_dead_lock which is already held
593 * here. Holding that lock prevents the lockf structure being
594 * pointed at by ut->uu_wchan from going away. Thus the vnode
595 * involved can be found and locked, and the corresponding
596 * blocking chain can then be examined safely.
597 */
598 const struct lockf *waitblock = (const void *)ut->uu_wchan;
599 if ((waitblock != NULL) && (ut->uu_wmesg == lockstr)) {
600 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);
601
602 vnode_t othervp = NULL;
603 if (waitblock->lf_vnode != vp) {
604 /*
605 * This thread in wproc is waiting for a lock
606 * on a different vnode; grab the lock on it
607 * that protects lf_next while we examine it.
608 */
609 othervp = waitblock->lf_vnode;
610 if (!lck_mtx_try_lock(&othervp->v_lock)) {
611 /*
612 * avoid kernel deadlock: drop all
613 * locks, pause for a bit to let the
614 * other thread do what it needs to do,
615 * then (because we drop and retake
616 * v_lock) retry the scan.
617 */
618 proc_unlock(wproc);
619 lck_mtx_unlock(&lf_dead_lock);
620 static struct timespec ts = {
621 .tv_sec = 0,
622 .tv_nsec = 2 * NSEC_PER_MSEC,
623 };
624 static const char pausestr[] = "lockf:pause";
625 (void) msleep(lock, &vp->v_lock, priority, pausestr, &ts);
626 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n", lock, othervp);
627 goto scan;
628 }
629 }
630
631 /*
632 * Get the lock blocking the lock
633 * which would block us, and make
634 * certain it hasn't become unblocked
635 * (been granted, e.g. between the time
636 * we called lf_getblock, and the time
637 * we successfully acquired the
638 * proc_lock).
639 */
640 const struct lockf *nextblock = waitblock->lf_next;
641 if (nextblock == NULL) {
642 if (othervp) {
643 lck_mtx_unlock(&othervp->v_lock);
644 }
645 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with waitblock %p and no lf_next; othervp %p\n", lock, waitblock, othervp);
646 continue;
647 }
648 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, nextblock, nextblock->lf_vnode);
649
650 /*
651 * Make sure it's an advisory range
652 * lock and not any other kind of lock;
653 * if we mix lock types, it's our own
654 * fault.
655 */
656 if ((nextblock->lf_flags & F_POSIX) == 0) {
657 if (othervp) {
658 lck_mtx_unlock(&othervp->v_lock);
659 }
660 continue;
661 }
662
663 /*
664 * If the owner of the lock that's
665 * blocking a lock that's blocking us
666 * getting the requested lock, then we
667 * would deadlock, so error out.
668 */
669 struct proc *bproc = nextblock->lf_owner;
670 const boolean_t deadlocked = bproc == lock->lf_owner;
671
672 if (othervp) {
673 lck_mtx_unlock(&othervp->v_lock);
674 }
675 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(bproc));
676 if (deadlocked) {
677 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock);
678 proc_unlock(wproc);
679 lck_mtx_unlock(&lf_dead_lock);
680 FREE(lock, M_LOCKF);
681 return EDEADLK;
682 }
683 }
684 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n", lock);
685 }
686 proc_unlock(wproc);
687 lck_mtx_unlock(&lf_dead_lock);
688 }
689
690 /*
691 * For flock type locks, we must first remove
692 * any shared locks that we hold before we sleep
693 * waiting for an exclusive lock.
694 */
695 if ((lock->lf_flags & F_FLOCK) &&
696 lock->lf_type == F_WRLCK) {
697 lock->lf_type = F_UNLCK;
698 if ((error = lf_clearlock(lock)) != 0) {
699 FREE(lock, M_LOCKF);
700 return error;
701 }
702 lock->lf_type = F_WRLCK;
703 }
704 /*
705 * Add our lock to the blocked list and sleep until we're free.
706 * Remember who blocked us (for deadlock detection).
707 */
708 lock->lf_next = block;
709 TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
710
711 if (!(lock->lf_flags & F_FLOCK)) {
712 block->lf_flags &= ~F_WAKE1_SAFE;
713 }
714
715 #if IMPORTANCE_INHERITANCE
716 /*
717 * Importance donation is done only for cases where the
718 * owning task can be unambiguously determined.
719 *
720 * POSIX type locks are not inherited by child processes;
721 * we maintain a 1:1 mapping between a lock and its owning
722 * process.
723 *
724 * Flock type locks are inherited across fork() and there is
725 * no 1:1 mapping in the general case. However, the fileglobs
726 * used by OFD locks *may* be confined to the process that
727 * created them, and thus have an "owner", in which case
728 * we also attempt importance donation.
729 */
730 if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0) {
731 lf_boost_blocking_proc(lock, block);
732 } else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
733 lock->lf_owner != block->lf_owner &&
734 NULL != lock->lf_owner && NULL != block->lf_owner) {
735 lf_boost_blocking_proc(lock, block);
736 }
737 #endif /* IMPORTANCE_INHERITANCE */
738
739 #ifdef LOCKF_DEBUGGING
740 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
741 lf_print("lf_setlock: blocking on", block);
742 lf_printlist("lf_setlock(block)", block);
743 }
744 #endif /* LOCKF_DEBUGGING */
745 DTRACE_FSINFO(advlock__wait, vnode_t, vp);
746
747 if (lock->lf_flags & F_POSIX) {
748 error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
749 /*
750 * Ensure that 'lock' doesn't get mutated or freed if a
751 * wakeup occurs while hunting for deadlocks (and holding
752 * lf_dead_lock - see above)
753 */
754 lck_mtx_lock(&lf_dead_lock);
755 lck_mtx_unlock(&lf_dead_lock);
756 } else {
757 static const char lockstr_np[] = "lockf:np";
758 error = msleep(lock, &vp->v_lock, priority, lockstr_np, timeout);
759 }
760
761 if (error == 0 && (lock->lf_flags & F_ABORT) != 0) {
762 error = EBADF;
763 }
764
765 if (lock->lf_next) {
766 /*
767 * lf_wakelock() always sets wakelock->lf_next to
768 * NULL before a wakeup; so we've been woken early
769 * - perhaps by a debugger, signal or other event.
770 *
771 * Remove 'lock' from the block list (avoids double-add
772 * in the spurious case, which would create a cycle)
773 */
774 TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
775 #if IMPORTANCE_INHERITANCE
776 /*
777 * Adjust the boost on lf_next.
778 */
779 lf_adjust_assertion(lock->lf_next);
780 #endif /* IMPORTANCE_INHERITANCE */
781 lock->lf_next = NULL;
782
783 if (error == 0) {
784 /*
785 * If this was a spurious wakeup, retry
786 */
787 printf("%s: spurious wakeup, retrying lock\n",
788 __func__);
789 continue;
790 }
791 }
792
793 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
794 if ((block = lf_getblock(lock, -1)) != NULL) {
795 lf_move_blocked(block, lock);
796 }
797 }
798
799 if (error) {
800 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
801 lf_wakelock(lock, TRUE);
802 }
803 FREE(lock, M_LOCKF);
804 /* Return ETIMEDOUT if a timeout occurred. */
805 if (error == EWOULDBLOCK) {
806 error = ETIMEDOUT;
807 }
808 return error;
809 }
810 }
811
812 /*
813 * No blocks!! Add the lock. Note that we will
814 * downgrade or upgrade any overlapping locks this
815 * process already owns.
816 *
817 * Skip over locks owned by other processes.
818 * Handle any locks that overlap and are owned by ourselves.
819 */
820 prev = head;
821 block = *head;
822 needtolink = 1;
823 for (;;) {
824 ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
825 if (ovcase) {
826 block = overlap->lf_next;
827 }
828 /*
829 * Six cases:
830 * 0) no overlap
831 * 1) overlap == lock
832 * 2) overlap contains lock
833 * 3) lock contains overlap
834 * 4) overlap starts before lock
835 * 5) overlap ends after lock
836 */
837 switch (ovcase) {
838 case OVERLAP_NONE:
839 if (needtolink) {
840 *prev = lock;
841 lock->lf_next = overlap;
842 }
843 break;
844
845 case OVERLAP_EQUALS_LOCK:
846 /*
847 * If downgrading lock, others may be
848 * able to acquire it.
849 */
850 if (lock->lf_type == F_RDLCK &&
851 overlap->lf_type == F_WRLCK) {
852 lf_wakelock(overlap, TRUE);
853 }
854 overlap->lf_type = lock->lf_type;
855 FREE(lock, M_LOCKF);
856 lock = overlap; /* for lf_coalesce_adjacent() */
857 break;
858
859 case OVERLAP_CONTAINS_LOCK:
860 /*
861 * Check for common starting point and different types.
862 */
863 if (overlap->lf_type == lock->lf_type) {
864 FREE(lock, M_LOCKF);
865 lock = overlap; /* for lf_coalesce_adjacent() */
866 break;
867 }
868 if (overlap->lf_start == lock->lf_start) {
869 *prev = lock;
870 lock->lf_next = overlap;
871 overlap->lf_start = lock->lf_end + 1;
872 } else {
873 /*
874 * If we can't split the lock, we can't
875 * grant it. Claim a system limit for the
876 * resource shortage.
877 */
878 if (lf_split(overlap, lock)) {
879 FREE(lock, M_LOCKF);
880 return ENOLCK;
881 }
882 }
883 lf_wakelock(overlap, TRUE);
884 break;
885
886 case OVERLAP_CONTAINED_BY_LOCK:
887 /*
888 * If downgrading lock, others may be able to
889 * acquire it, otherwise take the list.
890 */
891 if (lock->lf_type == F_RDLCK &&
892 overlap->lf_type == F_WRLCK) {
893 lf_wakelock(overlap, TRUE);
894 } else {
895 while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
896 ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
897 TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
898 lf_block);
899 TAILQ_INSERT_TAIL(&lock->lf_blkhd,
900 ltmp, lf_block);
901 ltmp->lf_next = lock;
902 }
903 }
904 /*
905 * Add the new lock if necessary and delete the overlap.
906 */
907 if (needtolink) {
908 *prev = lock;
909 lock->lf_next = overlap->lf_next;
910 prev = &lock->lf_next;
911 needtolink = 0;
912 } else {
913 *prev = overlap->lf_next;
914 }
915 FREE(overlap, M_LOCKF);
916 continue;
917
918 case OVERLAP_STARTS_BEFORE_LOCK:
919 /*
920 * Add lock after overlap on the list.
921 */
922 lock->lf_next = overlap->lf_next;
923 overlap->lf_next = lock;
924 overlap->lf_end = lock->lf_start - 1;
925 prev = &lock->lf_next;
926 lf_wakelock(overlap, TRUE);
927 needtolink = 0;
928 continue;
929
930 case OVERLAP_ENDS_AFTER_LOCK:
931 /*
932 * Add the new lock before overlap.
933 */
934 if (needtolink) {
935 *prev = lock;
936 lock->lf_next = overlap;
937 }
938 overlap->lf_start = lock->lf_end + 1;
939 lf_wakelock(overlap, TRUE);
940 break;
941 }
942 break;
943 }
944 /* Coalesce adjacent locks with identical attributes */
945 lf_coalesce_adjacent(lock);
946 #ifdef LOCKF_DEBUGGING
947 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
948 lf_print("lf_setlock: got the lock", lock);
949 lf_printlist("lf_setlock(out)", lock);
950 }
951 #endif /* LOCKF_DEBUGGING */
952 return 0;
953 }
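/*
 * Illustrative userland sketch (example only, never compiled into the
 * kernel): the classic two-process cycle that the POSIX deadlock detection
 * in lf_setlock() reports as EDEADLK.  Process A holds byte 0 and waits for
 * byte 1 while process B holds byte 1 and waits for byte 0.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int
lock_byte(int fd, off_t where, int cmd)
{
	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = where,
		.l_len    = 1,
	};
	return fcntl(fd, cmd, &fl);
}

/*
 * Process A: lock_byte(fd, 0, F_SETLK), then lock_byte(fd, 1, F_SETLKW).
 * Process B: lock_byte(fd, 1, F_SETLK), then lock_byte(fd, 0, F_SETLKW).
 * Whichever blocking call closes the cycle returns -1 with errno set to
 * EDEADLK instead of sleeping forever.
 */
#endif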
954
955
956 /*
957 * lf_clearlock
958 *
959 * Description: Remove a byte-range lock on a vnode. Generally, find the
960 * lock (or an overlap to that lock) and remove it (or shrink
961 * it), then wakeup anyone we can.
962 *
963 * Parameters: unlock The lock to clear
964 *
965 * Returns: 0 Success
966 * lf_split:ENOLCK
967 *
968 * Notes: A caller may unlock all the locks owned by the caller by
969 * specifying the entire file range; locks owned by other
970 * callers are not affected by this operation.
971 */
972 static int
973 lf_clearlock(struct lockf *unlock)
974 {
975 struct lockf **head = unlock->lf_head;
976 struct lockf *lf = *head;
977 struct lockf *overlap, **prev;
978 overlap_t ovcase;
979
980 if (lf == NOLOCKF) {
981 return 0;
982 }
983 #ifdef LOCKF_DEBUGGING
984 if (unlock->lf_type != F_UNLCK) {
985 panic("lf_clearlock: bad type");
986 }
987 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
988 lf_print("lf_clearlock", unlock);
989 }
990 #endif /* LOCKF_DEBUGGING */
991 prev = head;
992 while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
993 /*
994 * Wakeup the list of locks to be retried.
995 */
996 lf_wakelock(overlap, FALSE);
997 #if IMPORTANCE_INHERITANCE
998 if (overlap->lf_boosted == LF_BOOSTED) {
999 lf_drop_assertion(overlap);
1000 }
1001 #endif /* IMPORTANCE_INHERITANCE */
1002
1003 switch (ovcase) {
1004 case OVERLAP_NONE: /* satisfy compiler enum/switch */
1005 break;
1006
1007 case OVERLAP_EQUALS_LOCK:
1008 *prev = overlap->lf_next;
1009 FREE(overlap, M_LOCKF);
1010 break;
1011
1012 case OVERLAP_CONTAINS_LOCK: /* split it */
1013 if (overlap->lf_start == unlock->lf_start) {
1014 overlap->lf_start = unlock->lf_end + 1;
1015 break;
1016 }
1017 /*
1018 * If we can't split the lock, we can't grant it.
1019 * Claim a system limit for the resource shortage.
1020 */
1021 if (lf_split(overlap, unlock)) {
1022 return ENOLCK;
1023 }
1024 overlap->lf_next = unlock->lf_next;
1025 break;
1026
1027 case OVERLAP_CONTAINED_BY_LOCK:
1028 *prev = overlap->lf_next;
1029 lf = overlap->lf_next;
1030 FREE(overlap, M_LOCKF);
1031 continue;
1032
1033 case OVERLAP_STARTS_BEFORE_LOCK:
1034 overlap->lf_end = unlock->lf_start - 1;
1035 prev = &overlap->lf_next;
1036 lf = overlap->lf_next;
1037 continue;
1038
1039 case OVERLAP_ENDS_AFTER_LOCK:
1040 overlap->lf_start = unlock->lf_end + 1;
1041 break;
1042 }
1043 break;
1044 }
1045 #ifdef LOCKF_DEBUGGING
1046 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1047 lf_printlist("lf_clearlock", unlock);
1048 }
1049 #endif /* LOCKF_DEBUGGING */
1050 return 0;
1051 }
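/*
 * Note on whole-file unlocks: a userland F_UNLCK request with
 * l_whence = SEEK_SET, l_start = 0 and l_len = 0 arrives here as the range
 * [0, -1] (lf_end == -1 meaning "to end of file"), so every lock owned by
 * the caller overlaps it and is removed by the loop above.
 */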
1052
1053
1054 /*
1055 * lf_getlock
1056 *
1057 * Description: Check whether there is a blocking lock, and if so return
1058 * its process identifier into the lock being requested.
1059 *
1060 * Parameters: lock Pointer to lock to test for blocks
1061 * fl Pointer to flock structure to receive
1062 * the blocking lock information, if a
1063 * blocking lock is found.
1064 * matchpid -1, or pid value to match in lookup.
1065 *
1066 * Returns: 0 Success
1067 *
1068 * Implicit Returns:
1069 * *fl Contents modified to reflect the
1070 * blocking lock, if one is found; not
1071 * modified otherwise
1072 *
1073 * Notes: fl->l_pid will be (-1) for file locks and will only be set to
1074 * the blocking process ID for advisory record locks.
1075 */
1076 static int
1077 lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
1078 {
1079 struct lockf *block;
1080
1081 #ifdef LOCKF_DEBUGGING
1082 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1083 lf_print("lf_getlock", lock);
1084 }
1085 #endif /* LOCKF_DEBUGGING */
1086
1087 if ((block = lf_getblock(lock, matchpid))) {
1088 fl->l_type = block->lf_type;
1089 fl->l_whence = SEEK_SET;
1090 fl->l_start = block->lf_start;
1091 if (block->lf_end == -1) {
1092 fl->l_len = 0;
1093 } else {
1094 fl->l_len = block->lf_end - block->lf_start + 1;
1095 }
1096 if (NULL != block->lf_owner) {
1097 /*
1098 * lf_owner is only non-NULL when the lock
1099 * "owner" can be unambiguously determined
1100 */
1101 fl->l_pid = proc_pid(block->lf_owner);
1102 } else {
1103 fl->l_pid = -1;
1104 }
1105 } else {
1106 fl->l_type = F_UNLCK;
1107 }
1108 return 0;
1109 }
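/*
 * Illustrative userland sketch (example only, never compiled into the
 * kernel): F_GETLK reports the first conflicting lock found by
 * lf_getblock().  If the range is free, l_type comes back as F_UNLCK;
 * otherwise the conflicting range is filled in, with l_pid set to the
 * owning pid when it can be determined and -1 otherwise.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
who_blocks(int fd)
{
	struct flock fl = {
		.l_type   = F_WRLCK,    /* "would a write lock conflict?" */
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,          /* whole file */
	};

	if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type != F_UNLCK) {
		printf("blocked by pid %d over [%lld, +%lld]\n", (int)fl.l_pid,
		    (long long)fl.l_start, (long long)fl.l_len);
	}
}
#endif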
1110
1111 /*
1112 * lf_getblock
1113 *
1114 * Description: Walk the list of locks for an inode and return the first
1115 * blocking lock. A lock is considered blocking if we are not
1116 * the lock owner; otherwise, we are permitted to upgrade or
1117 * downgrade it, and it's not considered blocking.
1118 *
1119 * Parameters: lock The lock for which we are interested
1120 * in obtaining the blocking lock, if any
1121 * matchpid -1, or pid value to match in lookup.
1122 *
1123 * Returns: NOLOCKF No blocking lock exists
1124 * !NOLOCKF The address of the blocking lock's
1125 * struct lockf.
1126 */
1127 static struct lockf *
1128 lf_getblock(struct lockf *lock, pid_t matchpid)
1129 {
1130 struct lockf **prev, *overlap, *lf = *(lock->lf_head);
1131
1132 for (prev = lock->lf_head;
1133 lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
1134 lf = overlap->lf_next) {
1135 /*
1136 * Found an overlap.
1137 *
1138 * If we're matching pids, and it's a record lock,
1139 * or it's an OFD lock on a process-confined fd,
1140 * but the pid doesn't match, then keep on looking ..
1141 */
1142 if (matchpid != -1 &&
1143 (overlap->lf_flags & (F_POSIX | F_OFD_LOCK)) != 0 &&
1144 proc_pid(overlap->lf_owner) != matchpid) {
1145 continue;
1146 }
1147
1148 /*
1149 * does it block us?
1150 */
1151 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) {
1152 return overlap;
1153 }
1154 }
1155 return NOLOCKF;
1156 }
1157
1158
1159 /*
1160 * lf_findoverlap
1161 *
1162 * Description: Walk the list of locks to find an overlapping lock (if any).
1163 *
1164 * Parameters: lf First lock on lock list
1165 * lock The lock we are checking for an overlap
1166 * check Check type
1167 * prev pointer to a lock pointer pointer; on
1168 * return it holds the address of the pointer
1169 * that links to the overlapping lock
1170 * overlap pointer to pointer to contain address
1171 * of overlapping lock
1172 *
1173 * Returns: OVERLAP_NONE
1174 * OVERLAP_EQUALS_LOCK
1175 * OVERLAP_CONTAINS_LOCK
1176 * OVERLAP_CONTAINED_BY_LOCK
1177 * OVERLAP_STARTS_BEFORE_LOCK
1178 * OVERLAP_ENDS_AFTER_LOCK
1179 *
1180 * Implicit Returns:
1181 * *prev The address of the next pointer in the
1182 * lock previous to the overlapping lock;
1183 * this is generally used to relink the
1184 * lock list, avoiding a second iteration.
1185 * *overlap The pointer to the overlapping lock
1186 * itself; this is used to return data in
1187 * the check == OTHERS case, and for the
1188 * caller to modify the overlapping lock,
1189 * in the check == SELF case
1190 *
1191 * Note: This returns only the FIRST overlapping lock. There may be
1192 * more than one. lf_getlock will return the first blocking lock,
1193 * while lf_setlock will iterate over all overlapping locks and handle each of them in turn.
1194 *
1195 * The check parameter can be SELF, meaning we are looking for
1196 * overlapping locks owned by us, or it can be OTHERS, meaning
1197 * we are looking for overlapping locks owned by someone else so
1198 * we can report a blocking lock on an F_GETLK request.
1199 *
1200 * The value of *overlap and *prev are modified, even if there is
1201 * no overlapping lock found; always check the return code.
1202 */
1203 static overlap_t
1204 lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
1205 struct lockf ***prev, struct lockf **overlap)
1206 {
1207 off_t start, end;
1208 int found_self = 0;
1209
1210 *overlap = lf;
1211 if (lf == NOLOCKF) {
1212 return 0;
1213 }
1214 #ifdef LOCKF_DEBUGGING
1215 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1216 lf_print("lf_findoverlap: looking for overlap in", lock);
1217 }
1218 #endif /* LOCKF_DEBUGGING */
1219 start = lock->lf_start;
1220 end = lock->lf_end;
1221 while (lf != NOLOCKF) {
1222 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
1223 ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
1224 /*
1225 * Locks belonging to one process are adjacent on the
1226 * list, so if we've found any locks belonging to us,
1227 * and we're now seeing something else, then we've
1228 * examined all "self" locks. Note that bailing out
1229 * here is quite important; for coalescing, we assume
1230 * numerically adjacent locks from the same owner to
1231 * be adjacent on the list.
1232 */
1233 if ((type & SELF) && found_self) {
1234 return OVERLAP_NONE;
1235 }
1236
1237 *prev = &lf->lf_next;
1238 *overlap = lf = lf->lf_next;
1239 continue;
1240 }
1241
1242 if ((type & SELF)) {
1243 found_self = 1;
1244 }
1245
1246 #ifdef LOCKF_DEBUGGING
1247 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1248 lf_print("\tchecking", lf);
1249 }
1250 #endif /* LOCKF_DEBUGGING */
1251 /*
1252 * OK, check for overlap
1253 */
1254 if ((lf->lf_end != -1 && start > lf->lf_end) ||
1255 (end != -1 && lf->lf_start > end)) {
1256 /* Case 0 */
1257 LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");
1258
1259 /*
1260 * NOTE: assumes that locks for the same process are
1261 * nonintersecting and ordered.
1262 */
1263 if ((type & SELF) && end != -1 && lf->lf_start > end) {
1264 return OVERLAP_NONE;
1265 }
1266 *prev = &lf->lf_next;
1267 *overlap = lf = lf->lf_next;
1268 continue;
1269 }
1270 if ((lf->lf_start == start) && (lf->lf_end == end)) {
1271 LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
1272 return OVERLAP_EQUALS_LOCK;
1273 }
1274 if ((lf->lf_start <= start) &&
1275 (end != -1) &&
1276 ((lf->lf_end >= end) || (lf->lf_end == -1))) {
1277 LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
1278 return OVERLAP_CONTAINS_LOCK;
1279 }
1280 if (start <= lf->lf_start &&
1281 (end == -1 ||
1282 (lf->lf_end != -1 && end >= lf->lf_end))) {
1283 LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
1284 return OVERLAP_CONTAINED_BY_LOCK;
1285 }
1286 if ((lf->lf_start < start) &&
1287 ((lf->lf_end >= start) || (lf->lf_end == -1))) {
1288 LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
1289 return OVERLAP_STARTS_BEFORE_LOCK;
1290 }
1291 if ((lf->lf_start > start) &&
1292 (end != -1) &&
1293 ((lf->lf_end > end) || (lf->lf_end == -1))) {
1294 LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
1295 return OVERLAP_ENDS_AFTER_LOCK;
1296 }
1297 panic("lf_findoverlap: default");
1298 }
1299 return OVERLAP_NONE;
1300 }
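/*
 * Concrete examples of the return values above, for a request covering
 * [100, 199]:
 *
 *	existing [100, 199]	-> OVERLAP_EQUALS_LOCK
 *	existing [ 50, 299]	-> OVERLAP_CONTAINS_LOCK
 *	existing [120, 150]	-> OVERLAP_CONTAINED_BY_LOCK
 *	existing [ 50, 150]	-> OVERLAP_STARTS_BEFORE_LOCK
 *	existing [150, 299]	-> OVERLAP_ENDS_AFTER_LOCK
 *	existing [300, 399]	-> OVERLAP_NONE
 */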
1301
1302
1303 /*
1304 * lf_split
1305 *
1306 * Description: Split a lock and a contained region into two or three locks
1307 * as necessary.
1308 *
1309 * Parameters: lock1 Lock to split
1310 * lock2 Overlapping lock region requiring the
1311 * split (upgrade/downgrade/unlock)
1312 *
1313 * Returns: 0 Success
1314 * ENOLCK No memory for new lock
1315 *
1316 * Implicit Returns:
1317 * *lock1 Modified original lock
1318 * *lock2 Overlapping lock (inserted into list)
1319 * (new lock) Potential new lock inserted into list
1320 * if split results in 3 locks
1321 *
1322 * Notes: This operation can only fail if the split would result in three
1323 * locks, and there is insufficient memory to allocate the third
1324 * lock; in that case, neither of the locks will be modified.
1325 */
1326 static int
1327 lf_split(struct lockf *lock1, struct lockf *lock2)
1328 {
1329 struct lockf *splitlock;
1330
1331 #ifdef LOCKF_DEBUGGING
1332 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1333 lf_print("lf_split", lock1);
1334 lf_print("splitting from", lock2);
1335 }
1336 #endif /* LOCKF_DEBUGGING */
1337 /*
1338 * Check to see if splitting into only two pieces.
1339 */
1340 if (lock1->lf_start == lock2->lf_start) {
1341 lock1->lf_start = lock2->lf_end + 1;
1342 lock2->lf_next = lock1;
1343 return 0;
1344 }
1345 if (lock1->lf_end == lock2->lf_end) {
1346 lock1->lf_end = lock2->lf_start - 1;
1347 lock2->lf_next = lock1->lf_next;
1348 lock1->lf_next = lock2;
1349 return 0;
1350 }
1351 /*
1352 * Make a new lock consisting of the last part of
1353 * the encompassing lock
1354 */
1355 MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
1356 if (splitlock == NULL) {
1357 return ENOLCK;
1358 }
1359 bcopy(lock1, splitlock, sizeof *splitlock);
1360 splitlock->lf_start = lock2->lf_end + 1;
1361 TAILQ_INIT(&splitlock->lf_blkhd);
1362 lock1->lf_end = lock2->lf_start - 1;
1363 /*
1364 * OK, now link it in
1365 */
1366 splitlock->lf_next = lock1->lf_next;
1367 lock2->lf_next = splitlock;
1368 lock1->lf_next = lock2;
1369
1370 return 0;
1371 }
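/*
 * Concrete example of the three-lock case above: splitting an existing
 * lock covering [0, 999] around a request covering [100, 199] leaves the
 * original lock as [0, 99], links the request in as [100, 199], and
 * allocates splitlock to cover the remainder [200, 999].
 */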
1372
1373
1374 /*
1375 * lf_wakelock
1376 *
1377 * Wakeup a blocklist in the case of a downgrade or unlock, since others
1378 * waiting on the lock may now be able to acquire it.
1379 *
1380 * Parameters: listhead Lock list head on which waiters may
1381 * have pending locks
1382 *
1383 * Returns: <void>
1384 *
1385 * Notes: This function iterates a list of locks and wakes all waiters,
1386 * rather than only waiters for the contended regions. Because
1387 * of this, for heavily contended files, this can result in a
1388 * "thundering herd" situation. Refactoring the code could make
1389 * this operation more efficient, if heavy contention ever results
1390 * in a real-world performance problem.
1391 */
1392 static void
1393 lf_wakelock(struct lockf *listhead, boolean_t force_all)
1394 {
1395 struct lockf *wakelock;
1396 boolean_t wake_all = TRUE;
1397
1398 if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE)) {
1399 wake_all = FALSE;
1400 }
1401
1402 while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1403 wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
1404 TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
1405
1406 wakelock->lf_next = NOLOCKF;
1407 #ifdef LOCKF_DEBUGGING
1408 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1409 lf_print("lf_wakelock: awakening", wakelock);
1410 }
1411 #endif /* LOCKF_DEBUGGING */
1412 if (wake_all == FALSE) {
1413 /*
1414 * If there are items on the list head block list,
1415 * move them to the wakelock list instead, and then
1416 * correct their lf_next pointers.
1417 */
1418 if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1419 TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
1420
1421 struct lockf *tlock;
1422
1423 TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
1424 if (TAILQ_NEXT(tlock, lf_block) == tlock) {
1425 /* See rdar://10887303 */
1426 panic("cycle in wakelock list");
1427 }
1428 tlock->lf_next = wakelock;
1429 }
1430 }
1431 }
1432 wakeup(wakelock);
1433
1434 if (wake_all == FALSE) {
1435 break;
1436 }
1437 }
1438 }
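/*
 * Example of the F_WAKE1_SAFE path above: flock(2)-style locks always cover
 * the whole file, so when such a lock is released only the first waiter is
 * woken; the remaining waiters are re-queued behind that waiter instead of
 * all racing for the same range.
 */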
1439
1440
1441 #ifdef LOCKF_DEBUGGING
1442 #define GET_LF_OWNER_PID(lf) (proc_pid((lf)->lf_owner))
1443
1444 /*
1445 * lf_print DEBUG
1446 *
1447 * Print out a lock; lock information is prefixed by the string in 'tag'
1448 *
1449 * Parameters: tag A string tag for debugging
1450 * lock The lock whose information should be
1451 * displayed
1452 *
1453 * Returns: <void>
1454 */
1455 void
1456 lf_print(const char *tag, struct lockf *lock)
1457 {
1458 printf("%s: lock %p for ", tag, (void *)lock);
1459 if (lock->lf_flags & F_POSIX) {
1460 printf("proc %p (owner %d)",
1461 lock->lf_id, GET_LF_OWNER_PID(lock));
1462 } else if (lock->lf_flags & F_OFD_LOCK) {
1463 printf("fg %p (owner %d)",
1464 lock->lf_id, GET_LF_OWNER_PID(lock));
1465 } else {
1466 printf("id %p", (void *)lock->lf_id);
1467 }
1468 if (lock->lf_vnode != 0) {
1469 printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
1470 lock->lf_vnode,
1471 lock->lf_type == F_RDLCK ? "shared" :
1472 lock->lf_type == F_WRLCK ? "exclusive" :
1473 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1474 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1475 } else {
1476 printf(" %s, start 0x%016llx, end 0x%016llx",
1477 lock->lf_type == F_RDLCK ? "shared" :
1478 lock->lf_type == F_WRLCK ? "exclusive" :
1479 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1480 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1481 }
1482 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
1483 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
1484 } else {
1485 printf("\n");
1486 }
1487 }
1488
1489
1490 /*
1491 * lf_printlist DEBUG
1492 *
1493 * Print out a lock list for the vnode associated with 'lock'; lock information
1494 * is prefixed by the string in 'tag'
1495 *
1496 * Parameters: tag A string tag for debugging
1497 * lock The lock whose vnode's lock list should
1498 * be displayed
1499 *
1500 * Returns: <void>
1501 */
1502 void
1503 lf_printlist(const char *tag, struct lockf *lock)
1504 {
1505 struct lockf *lf, *blk;
1506
1507 if (lock->lf_vnode == 0) {
1508 return;
1509 }
1510
1511 printf("%s: Lock list for vno %p:\n",
1512 tag, lock->lf_vnode);
1513 for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
1514 printf("\tlock %p for ", (void *)lf);
1515 if (lf->lf_flags & F_POSIX) {
1516 printf("proc %p (owner %d)",
1517 lf->lf_id, GET_LF_OWNER_PID(lf));
1518 } else if (lf->lf_flags & F_OFD_LOCK) {
1519 printf("fg %p (owner %d)",
1520 lf->lf_id, GET_LF_OWNER_PID(lf));
1521 } else {
1522 printf("id %p", (void *)lf->lf_id);
1523 }
1524 printf(", %s, start 0x%016llx, end 0x%016llx",
1525 lf->lf_type == F_RDLCK ? "shared" :
1526 lf->lf_type == F_WRLCK ? "exclusive" :
1527 lf->lf_type == F_UNLCK ? "unlock" :
1528 "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
1529 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
1530 printf("\n\t\tlock request %p for ", (void *)blk);
1531 if (blk->lf_flags & F_POSIX) {
1532 printf("proc %p (owner %d)",
1533 blk->lf_id, GET_LF_OWNER_PID(blk));
1534 } else if (blk->lf_flags & F_OFD_LOCK) {
1535 printf("fg %p (owner %d)",
1536 blk->lf_id, GET_LF_OWNER_PID(blk));
1537 } else {
1538 printf("id %p", (void *)blk->lf_id);
1539 }
1540 printf(", %s, start 0x%016llx, end 0x%016llx",
1541 blk->lf_type == F_RDLCK ? "shared" :
1542 blk->lf_type == F_WRLCK ? "exclusive" :
1543 blk->lf_type == F_UNLCK ? "unlock" :
1544 "unknown", (intmax_t)blk->lf_start,
1545 (intmax_t)blk->lf_end);
1546 if (!TAILQ_EMPTY(&blk->lf_blkhd)) {
1547 panic("lf_printlist: bad list");
1548 }
1549 }
1550 printf("\n");
1551 }
1552 }
1553 #endif /* LOCKF_DEBUGGING */
1554
1555 #if IMPORTANCE_INHERITANCE
1556
1557 /*
1558 * lf_hold_assertion
1559 *
1560 * Call task importance hold assertion on the owner of the lock.
1561 *
1562 * Parameters: block_task Owner of the lock blocking
1563 * current thread.
1564 *
1565 * block lock on which the current thread
1566 * is blocking.
1567 *
1568 * Returns: <void>
1569 *
1570 * Notes: A task reference on block_task does not need to be held, since
1571 * the current thread holds the vnode lock and block_task holds a
1572 * file lock; removing that file lock at exit therefore requires
1573 * block_task to grab the vnode lock.
1574 */
1575 static void
1576 lf_hold_assertion(task_t block_task, struct lockf *block)
1577 {
1578 if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
1579 block->lf_boosted = LF_BOOSTED;
1580 LOCKF_DEBUG(LF_DBG_IMPINH,
1581 "lf: importance hold file lock assert on pid %d lock %p\n",
1582 proc_pid(block->lf_owner), block);
1583 }
1584 }
1585
1586
1587 /*
1588 * lf_jump_to_queue_head
1589 *
1590 * Jump the lock from the tail of the block queue to the head of
1591 * the queue.
1592 *
1593 * Parameters: block lockf struct containing the
1594 * block queue.
1595 * lock lockf struct to be jumped to the
1596 * front.
1597 *
1598 * Returns: <void>
1599 */
1600 static void
1601 lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
1602 {
1603 /* Move the lock to the head of the block queue. */
1604 TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
1605 TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
1606 }
1607
1608
1609 /*
1610 * lf_drop_assertion
1611 *
1612 * Drops the task hold assertion.
1613 *
1614 * Parameters: block lockf struct holding the assertion.
1615 *
1616 * Returns: <void>
1617 */
1618 static void
1619 lf_drop_assertion(struct lockf *block)
1620 {
1621 LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
1622 proc_pid(block->lf_owner), block);
1623
1624 task_t current_task = proc_task(block->lf_owner);
1625 task_importance_drop_file_lock_assertion(current_task, 1);
1626 block->lf_boosted = LF_NOT_BOOSTED;
1627 }
1628
1629 /*
1630 * lf_adjust_assertion
1631 *
1632 * Adjusts the importance assertion of a file lock. Goes through
1633 * all the blocking locks and checks whether the file lock still
1634 * needs to be boosted.
1635 *
1636 * Parameters: block lockf structure which needs to be adjusted.
1637 *
1638 * Returns: <void>
1639 */
1640 static void
1641 lf_adjust_assertion(struct lockf *block)
1642 {
1643 boolean_t drop_boost = TRUE;
1644 struct lockf *next;
1645
1646 /* Return if the lock is not boosted */
1647 if (block->lf_boosted == LF_NOT_BOOSTED) {
1648 return;
1649 }
1650
1651 TAILQ_FOREACH(next, &block->lf_blkhd, lf_block) {
1652 /* Check if block and next are same type of locks */
1653 if (((block->lf_flags & next->lf_flags & F_POSIX) != 0) ||
1654 ((block->lf_flags & next->lf_flags & F_OFD_LOCK) &&
1655 (block->lf_owner != next->lf_owner) &&
1656 (NULL != block->lf_owner && NULL != next->lf_owner))) {
1657 /* Check if next would be boosting block */
1658 if (task_is_importance_donor(proc_task(next->lf_owner)) &&
1659 task_is_importance_receiver_type(proc_task(block->lf_owner))) {
1660 /* Found a lock boosting block */
1661 drop_boost = FALSE;
1662 break;
1663 }
1664 }
1665 }
1666
1667 if (drop_boost) {
1668 lf_drop_assertion(block);
1669 }
1670 }
1671
1672 static void
1673 lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
1674 {
1675 task_t ltask = proc_task(lock->lf_owner);
1676 task_t btask = proc_task(block->lf_owner);
1677
1678 /*
1679 * Check if ltask can donate importance. The
1680 * check of imp_donor bit is done without holding
1681 * any lock. The value may change after you read it,
1682 * but it is ok to boost a task while someone else is
1683 * unboosting you.
1684 *
1685 * TODO: Support live inheritance on file locks.
1686 */
1687 if (task_is_importance_donor(ltask)) {
1688 LOCKF_DEBUG(LF_DBG_IMPINH,
1689 "lf: %d: attempt to boost pid %d that holds lock %p\n",
1690 proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);
1691
1692 if (block->lf_boosted != LF_BOOSTED &&
1693 task_is_importance_receiver_type(btask)) {
1694 lf_hold_assertion(btask, block);
1695 }
1696 lf_jump_to_queue_head(block, lock);
1697 }
1698 }
1699 #endif /* IMPORTANCE_INHERITANCE */