/*
 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/vnode_if.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/sdt.h>
#include <kern/task.h>

#include <sys/file_internal.h>

/*
 * This variable controls the maximum number of processes that will
 * be checked in doing deadlock detection.
 */
static int maxlockdepth = MAXDEPTH;

#if (DEVELOPMENT || DEBUG)
#define LOCKF_DEBUGGING	1
#endif

#ifdef LOCKF_DEBUGGING
#include <sys/sysctl.h>
void lf_print(const char *tag, struct lockf *lock);
void lf_printlist(const char *tag, struct lockf *lock);

#define	LF_DBG_LOCKOP	(1 << 0)	/* setlk, getlk, clearlk */
#define	LF_DBG_LIST	(1 << 1)	/* split, coalesce */
#define	LF_DBG_IMPINH	(1 << 2)	/* importance inheritance */
#define	LF_DBG_TRACE	(1 << 3)	/* errors, exit */

static int	lockf_debug = 0;	/* was 2, could be 3 ;-) */
SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");

/*
 * If there is no mask bit selector, or there is one, and the selector is
 * set, then output the debugging diagnostic.
 */
#define LOCKF_DEBUG(mask, ...)					\
	do {							\
		if (!(mask) || ((mask) & lockf_debug)) {	\
			printf(__VA_ARGS__);			\
		}						\
	} while (0)
#else	/* !LOCKF_DEBUGGING */
#define	LOCKF_DEBUG(mask, ...)	/* mask */
#endif	/* !LOCKF_DEBUGGING */
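
/*
 * Usage note (illustrative, not part of the build): on DEVELOPMENT or
 * DEBUG kernels, where LOCKF_DEBUGGING is defined above, the lockf_debug
 * mask can be changed at runtime through the sysctl registered above,
 * e.g. from a root shell:
 *
 *	sysctl debug.lockf_debug=1	# LF_DBG_LOCKOP: lock operations only
 *	sysctl debug.lockf_debug=15	# all four LF_DBG_* selectors
 */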

MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

#define NOLOCKF (struct lockf *)0
#define SELF	0x1
#define OTHERS	0x2
#define OFF_MAX 0x7fffffffffffffffULL	/* max off_t */

/*
 * Overlapping lock states
 */
typedef enum {
	OVERLAP_NONE = 0,
	OVERLAP_EQUALS_LOCK,
	OVERLAP_CONTAINS_LOCK,
	OVERLAP_CONTAINED_BY_LOCK,
	OVERLAP_STARTS_BEFORE_LOCK,
	OVERLAP_ENDS_AFTER_LOCK
} overlap_t;

static int	 lf_clearlock(struct lockf *);
static overlap_t lf_findoverlap(struct lockf *,
	    struct lockf *, int, struct lockf ***, struct lockf **);
static struct lockf *lf_getblock(struct lockf *, pid_t);
static int	 lf_getlock(struct lockf *, struct flock *, pid_t);
static int	 lf_setlock(struct lockf *, struct timespec *);
static int	 lf_split(struct lockf *, struct lockf *);
static void	 lf_wakelock(struct lockf *, boolean_t);

#if IMPORTANCE_INHERITANCE
static void	 lf_hold_assertion(task_t, struct lockf *);
static void	 lf_jump_to_queue_head(struct lockf *, struct lockf *);
static void	 lf_drop_assertion(struct lockf *);
static void	 lf_boost_blocking_proc(struct lockf *, struct lockf *);
#endif /* IMPORTANCE_INHERITANCE */

/*
 * lf_advlock
 *
 * Description:	Advisory record locking support
 *
 * Parameters:	ap			Argument pointer to a vnop_advlock_args
 *					argument descriptor structure for the
 *					lock operation to be attempted.
 *
 * Returns:	0			Success
 *		EOVERFLOW
 *		EINVAL
 *		ENOLCK			Number of locked regions exceeds limit
 *	lf_setlock:EAGAIN
 *	lf_setlock:EDEADLK
 *	lf_setlock:EINTR
 *	lf_setlock:ENOLCK
 *	lf_setlock:ETIMEDOUT
 *	lf_clearlock:ENOLCK
 *	vnode_size:???
 *
 * Notes:	We return ENOLCK when we run out of memory to support locks; as
 *		such, there is no specific expectation limit other than the
 *		amount of available resources.
 */
int
lf_advlock(struct vnop_advlock_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct flock *fl = ap->a_fl;
	vfs_context_t context = ap->a_context;
	struct lockf *lock;
	off_t start, end, oadd;
	u_quad_t size;
	int error;
	struct lockf **head = &vp->v_lockf;

	/* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: '%s' unlock without lock\n",
			    vfs_context_proc(context)->p_comm);
			return (0);
		}
	}

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {

	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:

		/*
		 * It's OK to cast the u_quad_t to an off_t here, since they
		 * are the same storage size, and the value of the returned
		 * contents will never overflow into the sign bit.  We need to
		 * do this because we will use size to force range checks.
		 */
		if ((error = vnode_size(vp, (off_t *)&size, context))) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: vnode_getattr failed: %d\n", error);
			return (error);
		}

		if (size > OFF_MAX ||
		    (fl->l_start > 0 &&
		     size > (u_quad_t)(OFF_MAX - fl->l_start)))
			return (EOVERFLOW);
		start = size + fl->l_start;
		break;

	default:
		LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
		    fl->l_whence);
		return (EINVAL);
	}
	if (start < 0) {
		LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
		    start);
		return (EINVAL);
	}
	if (fl->l_len < 0) {
		if (start == 0) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: len < 0 & start == 0\n");
			return (EINVAL);
		}
		end = start - 1;
		start += fl->l_len;
		if (start < 0) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: start < 0 (%qd)\n", start);
			return (EINVAL);
		}
	} else if (fl->l_len == 0)
		end = -1;
	else {
		oadd = fl->l_len - 1;
		if (oadd > (off_t)(OFF_MAX - start)) {
			LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
			return (EOVERFLOW);
		}
		end = start + oadd;
	}
	/*
	 * Create the lockf structure
	 */
	MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
	if (lock == NULL)
		return (ENOLCK);
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_id = ap->a_id;
	lock->lf_vnode = vp;
	lock->lf_type = fl->l_type;
	lock->lf_head = head;
	lock->lf_next = (struct lockf *)0;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = ap->a_flags;
#if IMPORTANCE_INHERITANCE
	lock->lf_boosted = LF_NOT_BOOSTED;
#endif
	if (ap->a_flags & F_POSIX)
		lock->lf_owner = (struct proc *)lock->lf_id;
	else
		lock->lf_owner = NULL;

	if (ap->a_flags & F_FLOCK)
		lock->lf_flags |= F_WAKE1_SAFE;

	lck_mtx_lock(&vp->v_lock);	/* protect the lockf list */
	/*
	 * Do the requested operation.
	 */
	switch (ap->a_op) {
	case F_SETLK:
		/*
		 * For F_OFD_* locks, lf_id is the fileglob.
		 * Record an "lf_owner" iff this is a confined fd
		 * i.e. it cannot escape this process and will be
		 * F_UNLCKed before the owner exits.  (This is
		 * the implicit guarantee needed to ensure lf_owner
		 * remains a valid reference here.)
		 */
		if (ap->a_flags & F_OFD_LOCK) {
			struct fileglob *fg = (void *)lock->lf_id;
			if (fg->fg_lflags & FG_CONFINED)
				lock->lf_owner = current_proc();
		}
		error = lf_setlock(lock, ap->a_timeout);
		break;

	case F_UNLCK:
		error = lf_clearlock(lock);
		FREE(lock, M_LOCKF);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl, -1);
		FREE(lock, M_LOCKF);
		break;

	default:
		FREE(lock, M_LOCKF);
		error = EINVAL;
		break;
	}
	lck_mtx_unlock(&vp->v_lock);	/* done manipulating the list */

	LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
	return (error);
}
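
/*
 * Illustrative example (not part of the build): how a userland fcntl(2)
 * request maps onto the [start, end] range computed above.  The fd and
 * error handling are hypothetical.
 *
 *	struct flock fl = {
 *		.l_whence = SEEK_SET,
 *		.l_start  = 100,
 *		.l_len    = 50,		// bytes 100..149
 *		.l_type   = F_WRLCK,
 *	};
 *	// lf_advlock() computes start = 100, end = 100 + (50 - 1) = 149.
 *	// With .l_len = 0 it would compute end = -1, i.e. "to end of file".
 *	if (fcntl(fd, F_SETLKW, &fl) == -1)
 *		err(1, "fcntl");	// EAGAIN, EDEADLK, EINTR, ...
 */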

/*
 * Empty the queue of msleeping requests for a lock on the given vnode.
 * Called with the vnode already locked.  Used for forced unmount, where
 * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
 * that prevents the vnode from ever being drained.  Force unmounting wins.
 */
void
lf_abort_advlocks(vnode_t vp)
{
	struct lockf *lock;

	if ((lock = vp->v_lockf) == NULL)
		return;

	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);

	if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
		struct lockf *tlock;

		TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
			/*
			 * Setting this flag should cause all
			 * currently blocked F_SETLK requests to
			 * return to userland with an errno.
			 */
			tlock->lf_flags |= F_ABORT;
		}
		lf_wakelock(lock, TRUE);
	}
}

/*
 * Take any lock attempts which are currently blocked by a given lock ("from")
 * and mark them as blocked by a different lock ("to").  Used in the case
 * where a byte range currently occupied by "from" is to be occupied by "to."
 */
static void
lf_move_blocked(struct lockf *to, struct lockf *from)
{
	struct lockf *tlock;

	TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
		tlock->lf_next = to;
	}

	TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
}

/*
 * lf_coalesce_adjacent
 *
 * Description:	Helper function: when setting a lock, coalesce adjacent
 *		locks.  Needed because adjacent locks are not overlapping,
 *		but POSIX requires that they be coalesced.
 *
 * Parameters:	lock			The new lock which may be adjacent
 *					to already locked regions, and which
 *					should therefore be coalesced with them
 *
 * Returns:	<void>
 */
static void
lf_coalesce_adjacent(struct lockf *lock)
{
	struct lockf **lf = lock->lf_head;

	while (*lf != NOLOCKF) {
		/* reject locks that obviously could not be coalesced */
		if ((*lf == lock) ||
		    ((*lf)->lf_id != lock->lf_id) ||
		    ((*lf)->lf_type != lock->lf_type)) {
			lf = &(*lf)->lf_next;
			continue;
		}

		/*
		 * NOTE: Assumes that if two locks are adjacent on the number line
		 * and belong to the same owner, then they are adjacent on the list.
		 */
		if ((*lf)->lf_end != -1 &&
		    ((*lf)->lf_end + 1) == lock->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
			lock->lf_start = (*lf)->lf_start;
			*lf = lock;
			lf = &(*lf)->lf_next;

			lf_move_blocked(lock, adjacent);

			FREE(adjacent, M_LOCKF);
			continue;
		}
		/* If the lock starts adjacent to us, we can coalesce it */
		if (lock->lf_end != -1 &&
		    (lock->lf_end + 1) == (*lf)->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
			lock->lf_end = (*lf)->lf_end;
			lock->lf_next = (*lf)->lf_next;
			lf = &lock->lf_next;

			lf_move_blocked(lock, adjacent);

			FREE(adjacent, M_LOCKF);
			continue;
		}

		/* no matching conditions; go on to next lock */
		lf = &(*lf)->lf_next;
	}
}

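/*
 * Illustrative example (not part of the build; fd is hypothetical):
 *
 *	struct flock a = { .l_whence = SEEK_SET, .l_start = 0,
 *	    .l_len = 100, .l_type = F_WRLCK };
 *	struct flock b = { .l_whence = SEEK_SET, .l_start = 100,
 *	    .l_len = 100, .l_type = F_WRLCK };
 *	fcntl(fd, F_SETLK, &a);		// lock [0, 99]
 *	fcntl(fd, F_SETLK, &b);		// lock [100, 199]
 *
 * After the second call the vnode's list holds ONE lock spanning
 * [0, 199]: the ranges are adjacent, with the same owner and type, so
 * they are coalesced here rather than kept as two entries, and any
 * requests blocked on the absorbed entry are re-pointed at the merged
 * lock via lf_move_blocked().
 */
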
/*
 * lf_setlock
 *
 * Description:	Set a byte-range lock.
 *
 * Parameters:	lock			The lock structure describing the lock
 *					to be set; allocated by the caller, it
 *					will be linked into the lock list if
 *					the set is successful, and freed if the
 *					set is unsuccessful.
 *
 *		timeout			Timeout specified in the case of
 *					SETLKWTIMEOUT.
 *
 * Returns:	0			Success
 *		EAGAIN
 *		EDEADLK
 *	lf_split:ENOLCK
 *	lf_clearlock:ENOLCK
 *	msleep:EINTR
 *	msleep:ETIMEDOUT
 *
 * Notes:	We add the lock to the provisional lock list.  We do not
 *		coalesce at this time; this has implications for other lock
 *		requestors in the blocker search mechanism.
 */
static int
lf_setlock(struct lockf *lock, struct timespec *timeout)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap, *ltmp;
	static char lockstr[] = "lockf";
	int priority, needtolink, error;
	struct vnode *vp = lock->lf_vnode;
	overlap_t ovcase;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & LF_DBG_LOCKOP) {
		lf_print("lf_setlock", lock);
		lf_printlist("lf_setlock(in)", lock);
	}
#endif /* LOCKF_DEBUGGING */

	/*
	 * Set the priority
	 */
	priority = PLOCK;
	if (lock->lf_type == F_WRLCK)
		priority += 4;
	priority |= PCATCH;
	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock, -1))) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
			FREE(lock, M_LOCKF);
			return (EAGAIN);
		}

		/*
		 * We are blocked. Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 *
		 * OFD byte-range locks currently do NOT support
		 * deadlock detection.
		 *
		 * For POSIX byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us.  MAXDEPTH is set just to make sure we
		 * do not go off into neverland.
		 */
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			struct proc *wproc, *bproc;
			struct uthread *ut;
			struct lockf *waitblock;
			int i = 0;

			/* The block is waiting on something */
			wproc = block->lf_owner;
			proc_lock(wproc);
			TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
				/*
				 * While the thread is asleep (uu_wchan != 0)
				 * in this code (uu_wmesg == lockstr)
				 * and we have not exceeded the maximum cycle
				 * depth (i < maxlockdepth), then check for a
				 * cycle to see if the lock is blocked behind
				 * someone blocked behind us.
				 */
				while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
				    ut->uu_wmesg == lockstr &&
				    (i++ < maxlockdepth)) {
					waitblock = (struct lockf *)ut->uu_wchan;
					/*
					 * Get the lock blocking the lock
					 * which would block us, and make
					 * certain it hasn't come unblocked
					 * (been granted, e.g. between the time
					 * we called lf_getblock, and the time
					 * we successfully acquired the
					 * proc_lock).
					 */
					waitblock = waitblock->lf_next;
					if (waitblock == NULL)
						break;

					/*
					 * Make sure it's an advisory range
					 * lock and not any other kind of lock;
					 * if we mix lock types, it's our own
					 * fault.
					 */
					if ((waitblock->lf_flags & F_POSIX) == 0)
						break;

					/*
					 * If the owner of the lock that's
					 * blocking a lock that's blocking us
					 * getting the requested lock, then we
					 * would deadlock, so error out.
					 */
					bproc = waitblock->lf_owner;
					if (bproc == lock->lf_owner) {
						proc_unlock(wproc);
						FREE(lock, M_LOCKF);
						return (EDEADLK);
					}
				}
			}
			proc_unlock(wproc);
		}

		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			if ((error = lf_clearlock(lock)) != 0) {
				FREE(lock, M_LOCKF);
				return (error);
			}
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);

		if (!(lock->lf_flags & F_FLOCK))
			block->lf_flags &= ~F_WAKE1_SAFE;

#if IMPORTANCE_INHERITANCE
		/*
		 * Importance donation is done only for cases where the
		 * owning task can be unambiguously determined.
		 *
		 * POSIX type locks are not inherited by child processes;
		 * we maintain a 1:1 mapping between a lock and its owning
		 * process.
		 *
		 * Flock type locks are inherited across fork() and there is
		 * no 1:1 mapping in the general case.  However, the fileglobs
		 * used by OFD locks *may* be confined to the process that
		 * created them, and thus have an "owner", in which case
		 * we also attempt importance donation.
		 */
		if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0)
			lf_boost_blocking_proc(lock, block);
		else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
		    lock->lf_owner != block->lf_owner &&
		    NULL != lock->lf_owner && NULL != block->lf_owner)
			lf_boost_blocking_proc(lock, block);
#endif /* IMPORTANCE_INHERITANCE */

#ifdef LOCKF_DEBUGGING
		if (lockf_debug & LF_DBG_LOCKOP) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock(block)", block);
		}
#endif /* LOCKF_DEBUGGING */
		DTRACE_FSINFO(advlock__wait, vnode_t, vp);

		error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);

		if (error == 0 && (lock->lf_flags & F_ABORT) != 0)
			error = EBADF;

		if (lock->lf_next) {
			/*
			 * lf_wakelock() always sets wakelock->lf_next to
			 * NULL before a wakeup; so we've been woken early
			 * - perhaps by a debugger, signal or other event.
			 *
			 * Remove 'lock' from the block list (avoids double-add
			 * in the spurious case, which would create a cycle)
			 */
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
			lock->lf_next = NULL;

			if (error == 0) {
				/*
				 * If this was a spurious wakeup, retry
				 */
				printf("%s: spurious wakeup, retrying lock\n",
				    __func__);
				continue;
			}
		}

		if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
			if ((block = lf_getblock(lock, -1)) != NULL)
				lf_move_blocked(block, lock);
		}

		if (error) {
			if (!TAILQ_EMPTY(&lock->lf_blkhd))
				lf_wakelock(lock, TRUE);
			FREE(lock, M_LOCKF);
			/* Return ETIMEDOUT if timeout occurred. */
			if (error == EWOULDBLOCK) {
				error = ETIMEDOUT;
			}
			return (error);
		}
	}

	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase)
			block = overlap->lf_next;
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case OVERLAP_NONE:
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case OVERLAP_EQUALS_LOCK:
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK)
				lf_wakelock(overlap, TRUE);
			overlap->lf_type = lock->lf_type;
			FREE(lock, M_LOCKF);
			lock = overlap; /* for lf_coalesce_adjacent() */
			break;

		case OVERLAP_CONTAINS_LOCK:
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				FREE(lock, M_LOCKF);
				lock = overlap; /* for lf_coalesce_adjacent() */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				overlap->lf_start = lock->lf_end + 1;
			} else {
				/*
				 * If we can't split the lock, we can't
				 * grant it.  Claim a system limit for the
				 * resource shortage.
				 */
				if (lf_split(overlap, lock)) {
					FREE(lock, M_LOCKF);
					return (ENOLCK);
				}
			}
			lf_wakelock(overlap, TRUE);
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap, TRUE);
			} else {
				while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
					ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
					TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
					    lf_block);
					TAILQ_INSERT_TAIL(&lock->lf_blkhd,
					    ltmp, lf_block);
					ltmp->lf_next = lock;
				}
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else
				*prev = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap, TRUE);
			needtolink = 0;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			overlap->lf_start = lock->lf_end + 1;
			lf_wakelock(overlap, TRUE);
			break;
		}
		break;
	}
	/* Coalesce adjacent locks with identical attributes */
	lf_coalesce_adjacent(lock);
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & LF_DBG_LOCKOP) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock(out)", lock);
	}
#endif /* LOCKF_DEBUGGING */
	return (0);
}
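
/*
 * Illustrative deadlock scenario (not part of the build) for the POSIX
 * cycle detection above.  Processes A and B, same file, hypothetical fds:
 *
 *	A: fcntl(fd, F_SETLKW, &lock_0_99);	// granted
 *	B: fcntl(fd, F_SETLKW, &lock_100_199);	// granted
 *	A: fcntl(fd, F_SETLKW, &lock_100_199);	// sleeps, blocked on B
 *	B: fcntl(fd, F_SETLKW, &lock_0_99);	// would sleep blocked on A:
 *						// cycle found, B gets EDEADLK
 */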

/*
 * lf_clearlock
 *
 * Description:	Remove a byte-range lock on a vnode.  Generally, find the
 *		lock (or an overlap to that lock) and remove it (or shrink
 *		it), then wakeup anyone we can.
 *
 * Parameters:	unlock			The lock to clear
 *
 * Returns:	0			Success
 *	lf_split:ENOLCK
 *
 * Notes:	A caller may unlock all the locks owned by the caller by
 *		specifying the entire file range; locks owned by other
 *		callers are not affected by this operation.
 */
static int
lf_clearlock(struct lockf *unlock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	overlap_t ovcase;

	if (lf == NOLOCKF)
		return (0);
#ifdef LOCKF_DEBUGGING
	if (unlock->lf_type != F_UNLCK)
		panic("lf_clearlock: bad type");
	if (lockf_debug & LF_DBG_LOCKOP)
		lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUGGING */
	prev = head;
	while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap, FALSE);
#if IMPORTANCE_INHERITANCE
		if (overlap->lf_boosted == LF_BOOSTED) {
			lf_drop_assertion(overlap);
		}
#endif /* IMPORTANCE_INHERITANCE */

		switch (ovcase) {
		case OVERLAP_NONE:	/* satisfy compiler enum/switch */
			break;

		case OVERLAP_EQUALS_LOCK:
			*prev = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			break;

		case OVERLAP_CONTAINS_LOCK: /* split it */
			if (overlap->lf_start == unlock->lf_start) {
				overlap->lf_start = unlock->lf_end + 1;
				break;
			}
			/*
			 * If we can't split the lock, we can't grant it.
			 * Claim a system limit for the resource shortage.
			 */
			if (lf_split(overlap, unlock))
				return (ENOLCK);
			overlap->lf_next = unlock->lf_next;
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			overlap->lf_start = unlock->lf_end + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & LF_DBG_LOCKOP)
		lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUGGING */
	return (0);
}

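/*
 * Illustrative example (not part of the build): unlocking the middle of
 * a held range shrinks or splits it rather than removing it outright.
 * With a held write lock on [0, 199], an F_UNLCK of [50, 99] leaves two
 * locks, [0, 49] and [100, 199] (the OVERLAP_CONTAINS_LOCK case, via
 * lf_split()); an F_UNLCK of [0, 199] removes the lock entirely.
 */
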
/*
 * lf_getlock
 *
 * Description:	Check whether there is a blocking lock, and if so return
 *		its process identifier into the lock being requested.
 *
 * Parameters:	lock			Pointer to lock to test for blocks
 *		fl			Pointer to flock structure to receive
 *					the blocking lock information, if a
 *					blocking lock is found.
 *		matchpid		-1, or pid value to match in lookup.
 *
 * Returns:	0			Success
 *
 * Implicit Returns:
 *		*fl			Contents modified to reflect the
 *					blocking lock, if one is found; not
 *					modified otherwise
 *
 * Notes:	fl->l_pid will be (-1) for file locks and will only be set to
 *		the blocking process ID for advisory record locks.
 */
static int
lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
{
	struct lockf *block;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & LF_DBG_LOCKOP)
		lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUGGING */

	if ((block = lf_getblock(lock, matchpid))) {
		fl->l_type = block->lf_type;
		fl->l_whence = SEEK_SET;
		fl->l_start = block->lf_start;
		if (block->lf_end == -1)
			fl->l_len = 0;
		else
			fl->l_len = block->lf_end - block->lf_start + 1;
		if (NULL != block->lf_owner) {
			/*
			 * lf_owner is only non-NULL when the lock
			 * "owner" can be unambiguously determined
			 */
			fl->l_pid = proc_pid(block->lf_owner);
		} else
			fl->l_pid = -1;
	} else {
		fl->l_type = F_UNLCK;
	}
	return (0);
}

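/*
 * Illustrative example (not part of the build; fd is hypothetical):
 *
 *	struct flock fl = { .l_whence = SEEK_SET, .l_start = 0,
 *	    .l_len = 0, .l_type = F_WRLCK };
 *	fcntl(fd, F_GETLK, &fl);
 *	// If another process holds a conflicting lock, fl now describes
 *	// it (l_type, l_start, l_len, l_pid); otherwise l_type == F_UNLCK.
 */
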
/*
 * lf_getblock
 *
 * Description:	Walk the list of locks for an inode and return the first
 *		blocking lock.  A lock is considered blocking if we are not
 *		the lock owner; otherwise, we are permitted to upgrade or
 *		downgrade it, and it's not considered blocking.
 *
 * Parameters:	lock			The lock for which we are interested
 *					in obtaining the blocking lock, if any
 *		matchpid		-1, or pid value to match in lookup.
 *
 * Returns:	NOLOCKF			No blocking lock exists
 *		!NOLOCKF		The address of the blocking lock's
 *					struct lockf.
 */
static struct lockf *
lf_getblock(struct lockf *lock, pid_t matchpid)
{
	struct lockf **prev, *overlap, *lf = *(lock->lf_head);

	for (prev = lock->lf_head;
	    lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
	    lf = overlap->lf_next) {
		/*
		 * Found an overlap.
		 *
		 * If we're matching pids, and it's a record lock,
		 * or it's an OFD lock on a process-confined fd,
		 * but the pid doesn't match, then keep on looking ..
		 */
		if (matchpid != -1 &&
		    (overlap->lf_flags & (F_POSIX|F_OFD_LOCK)) != 0 &&
		    proc_pid(overlap->lf_owner) != matchpid)
			continue;

		/*
		 * does it block us?
		 */
		if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
			return (overlap);
	}
	return (NOLOCKF);
}


/*
 * lf_findoverlap
 *
 * Description:	Walk the list of locks to find an overlapping lock (if any).
 *
 * Parameters:	lf			First lock on lock list
 *		lock			The lock we are checking for an overlap
 *		check			Check type
 *		prev			pointer to pointer pointer to contain
 *					address of pointer to previous lock
 *					pointer to overlapping lock, if overlap
 *		overlap			pointer to pointer to contain address
 *					of overlapping lock
 *
 * Returns:	OVERLAP_NONE
 *		OVERLAP_EQUALS_LOCK
 *		OVERLAP_CONTAINS_LOCK
 *		OVERLAP_CONTAINED_BY_LOCK
 *		OVERLAP_STARTS_BEFORE_LOCK
 *		OVERLAP_ENDS_AFTER_LOCK
 *
 * Implicit Returns:
 *		*prev			The address of the next pointer in the
 *					lock previous to the overlapping lock;
 *					this is generally used to relink the
 *					lock list, avoiding a second iteration.
 *		*overlap		The pointer to the overlapping lock
 *					itself; this is used to return data in
 *					the check == OTHERS case, and for the
 *					caller to modify the overlapping lock,
 *					in the check == SELF case
 *
 * Note:	This returns only the FIRST overlapping lock.  There may be
 *		more than one.  lf_getlock will return the first blocking lock,
 *		while lf_setlock will iterate over all overlapping locks to
 *		apply the requested operation.
 *
 *		The check parameter can be SELF, meaning we are looking for
 *		overlapping locks owned by us, or it can be OTHERS, meaning
 *		we are looking for overlapping locks owned by someone else so
 *		we can report a blocking lock on an F_GETLK request.
 *
 *		The value of *overlap and *prev are modified, even if there is
 *		no overlapping lock found; always check the return code.
 */
static overlap_t
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
    struct lockf ***prev, struct lockf **overlap)
{
	off_t start, end;
	int found_self = 0;

	*overlap = lf;
	if (lf == NOLOCKF)
		return (0);
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & LF_DBG_LIST)
		lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUGGING */
	start = lock->lf_start;
	end = lock->lf_end;
	while (lf != NOLOCKF) {
		if (((type & SELF) && lf->lf_id != lock->lf_id) ||
		    ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
			/*
			 * Locks belonging to one process are adjacent on the
			 * list, so if we've found any locks belonging to us,
			 * and we're now seeing something else, then we've
			 * examined all "self" locks.  Note that bailing out
			 * here is quite important; for coalescing, we assume
			 * numerically adjacent locks from the same owner to
			 * be adjacent on the list.
			 */
			if ((type & SELF) && found_self) {
				return OVERLAP_NONE;
			}

			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}

		if ((type & SELF)) {
			found_self = 1;
		}

#ifdef LOCKF_DEBUGGING
		if (lockf_debug & LF_DBG_LIST)
			lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUGGING */
		/*
		 * OK, check for overlap
		 */
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
			/* Case 0 */
			LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");

			/*
			 * NOTE: assumes that locks for the same process are
			 * nonintersecting and ordered.
			 */
			if ((type & SELF) && end != -1 && lf->lf_start > end)
				return (OVERLAP_NONE);
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
		if ((lf->lf_start == start) && (lf->lf_end == end)) {
			LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
			return (OVERLAP_EQUALS_LOCK);
		}
		if ((lf->lf_start <= start) &&
		    (end != -1) &&
		    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
			return (OVERLAP_CONTAINS_LOCK);
		}
		if (start <= lf->lf_start &&
		    (end == -1 ||
		    (lf->lf_end != -1 && end >= lf->lf_end))) {
			LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
			return (OVERLAP_CONTAINED_BY_LOCK);
		}
		if ((lf->lf_start < start) &&
		    ((lf->lf_end >= start) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
			return (OVERLAP_STARTS_BEFORE_LOCK);
		}
		if ((lf->lf_start > start) &&
		    (end != -1) &&
		    ((lf->lf_end > end) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
			return (OVERLAP_ENDS_AFTER_LOCK);
		}
		panic("lf_findoverlap: default");
	}
	return (OVERLAP_NONE);
}

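/*
 * Illustrative mapping of the overlap cases (not part of the build),
 * with the probe lock covering [100, 199]:
 *
 *	list lock [100, 199]  ->  OVERLAP_EQUALS_LOCK
 *	list lock [ 50, 249]  ->  OVERLAP_CONTAINS_LOCK
 *	list lock [120, 179]  ->  OVERLAP_CONTAINED_BY_LOCK
 *	list lock [ 50, 149]  ->  OVERLAP_STARTS_BEFORE_LOCK
 *	list lock [150, 249]  ->  OVERLAP_ENDS_AFTER_LOCK
 *	list lock [300, 399]  ->  OVERLAP_NONE
 */
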
/*
 * lf_split
 *
 * Description:	Split a lock and a contained region into two or three locks
 *		as necessary.
 *
 * Parameters:	lock1			Lock to split
 *		lock2			Overlapping lock region requiring the
 *					split (upgrade/downgrade/unlock)
 *
 * Returns:	0			Success
 *		ENOLCK			No memory for new lock
 *
 * Implicit Returns:
 *		*lock1			Modified original lock
 *		*lock2			Overlapping lock (inserted into list)
 *		(new lock)		Potential new lock inserted into list
 *					if split results in 3 locks
 *
 * Notes:	This operation can only fail if the split would result in three
 *		locks, and there is insufficient memory to allocate the third
 *		lock; in that case, neither of the locks will be modified.
 */
static int
lf_split(struct lockf *lock1, struct lockf *lock2)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & LF_DBG_LIST) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUGGING */
	/*
	 * Check to see if splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		lock1->lf_start = lock2->lf_end + 1;
		lock2->lf_next = lock1;
		return (0);
	}
	if (lock1->lf_end == lock2->lf_end) {
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return (0);
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
	if (splitlock == NULL)
		return (ENOLCK);
	bcopy(lock1, splitlock, sizeof *splitlock);
	splitlock->lf_start = lock2->lf_end + 1;
	TAILQ_INIT(&splitlock->lf_blkhd);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;

	return (0);
}

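/*
 * Illustrative example (not part of the build): with lock1 covering
 * [0, 199] and lock2 covering [50, 99], lf_split() yields three list
 * entries: lock1 shrunk to [0, 49], lock2 inserted as [50, 99], and a
 * newly allocated splitlock covering [100, 199].  If lock2 had shared a
 * start or an end with lock1, only two pieces would result, no
 * allocation would be needed, and ENOLCK could not occur.
 */
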
/*
 * lf_wakelock
 *
 * Wakeup a blocklist in the case of a downgrade or unlock, since others
 * waiting on the lock may now be able to acquire it.
 *
 * Parameters:	listhead		Lock list head on which waiters may
 *					have pending locks
 *
 * Returns:	<void>
 *
 * Notes:	This function iterates a list of locks and wakes all waiters,
 *		rather than only waiters for the contended regions.  Because
 *		of this, for heavily contended files, this can result in a
 *		"thundering herd" situation.  Refactoring the code could make
 *		this operation more efficient, if heavy contention ever results
 *		in a real-world performance problem.
 */
static void
lf_wakelock(struct lockf *listhead, boolean_t force_all)
{
	struct lockf *wakelock;
	boolean_t wake_all = TRUE;

	if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE))
		wake_all = FALSE;

	while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
		wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);

		wakelock->lf_next = NOLOCKF;
#ifdef LOCKF_DEBUGGING
		if (lockf_debug & LF_DBG_LOCKOP)
			lf_print("lf_wakelock: awakening", wakelock);
#endif /* LOCKF_DEBUGGING */
		if (wake_all == FALSE) {
			/*
			 * If there are items on the list head block list,
			 * move them to the wakelock list instead, and then
			 * correct their lf_next pointers.
			 */
			if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
				TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);

				struct lockf *tlock;

				TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
					if (TAILQ_NEXT(tlock, lf_block) == tlock) {
						/* See rdar://10887303 */
						panic("cycle in wakelock list");
					}
					tlock->lf_next = wakelock;
				}
			}
		}
		wakeup(wakelock);

		if (wake_all == FALSE)
			break;
	}
}


#ifdef LOCKF_DEBUGGING
#define GET_LF_OWNER_PID(lf)	(proc_pid((lf)->lf_owner))

/*
 * lf_print DEBUG
 *
 * Print out a lock; lock information is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose information should be
 *					displayed
 *
 * Returns:	<void>
 */
void
lf_print(const char *tag, struct lockf *lock)
{
	printf("%s: lock %p for ", tag, (void *)lock);
	if (lock->lf_flags & F_POSIX)
		printf("proc %p (owner %d)",
		    lock->lf_id, GET_LF_OWNER_PID(lock));
	else if (lock->lf_flags & F_OFD_LOCK)
		printf("fg %p (owner %d)",
		    lock->lf_id, GET_LF_OWNER_PID(lock));
	else
		printf("id %p", (void *)lock->lf_id);
	if (lock->lf_vnode != 0)
		printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_vnode,
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
	else
		printf(" %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
	if (!TAILQ_EMPTY(&lock->lf_blkhd))
		printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
	else
		printf("\n");
}


/*
 * lf_printlist DEBUG
 *
 * Print out a lock list for the vnode associated with 'lock'; lock information
 * is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose vnode's lock list should
 *					be displayed
 *
 * Returns:	<void>
 */
void
lf_printlist(const char *tag, struct lockf *lock)
{
	struct lockf *lf, *blk;

	if (lock->lf_vnode == 0)
		return;

	printf("%s: Lock list for vno %p:\n",
	    tag, lock->lf_vnode);
	for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
		printf("\tlock %p for ", (void *)lf);
		if (lf->lf_flags & F_POSIX)
			printf("proc %p (owner %d)",
			    lf->lf_id, GET_LF_OWNER_PID(lf));
		else if (lf->lf_flags & F_OFD_LOCK)
			printf("fg %p (owner %d)",
			    lf->lf_id, GET_LF_OWNER_PID(lf));
		else
			printf("id %p", (void *)lf->lf_id);
		printf(", %s, start 0x%016llx, end 0x%016llx",
		    lf->lf_type == F_RDLCK ? "shared" :
		    lf->lf_type == F_WRLCK ? "exclusive" :
		    lf->lf_type == F_UNLCK ? "unlock" :
		    "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
			printf("\n\t\tlock request %p for ", (void *)blk);
			if (blk->lf_flags & F_POSIX)
				printf("proc %p (owner %d)",
				    blk->lf_id, GET_LF_OWNER_PID(blk));
			else if (blk->lf_flags & F_OFD_LOCK)
				printf("fg %p (owner %d)",
				    blk->lf_id, GET_LF_OWNER_PID(blk));
			else
				printf("id %p", (void *)blk->lf_id);
			printf(", %s, start 0x%016llx, end 0x%016llx",
			    blk->lf_type == F_RDLCK ? "shared" :
			    blk->lf_type == F_WRLCK ? "exclusive" :
			    blk->lf_type == F_UNLCK ? "unlock" :
			    "unknown", (intmax_t)blk->lf_start,
			    (intmax_t)blk->lf_end);
			if (!TAILQ_EMPTY(&blk->lf_blkhd))
				panic("lf_printlist: bad list");
		}
		printf("\n");
	}
}
#endif /* LOCKF_DEBUGGING */

#if IMPORTANCE_INHERITANCE

/*
 * lf_hold_assertion
 *
 * Call task importance hold assertion on the owner of the lock.
 *
 * Parameters:	block_task		Owner of the lock blocking
 *					current thread.
 *
 *		block			lock on which the current thread
 *					is blocked.
 *
 * Returns:	<void>
 *
 * Notes:	A task reference on block_task need not be held, since the
 *		current thread holds the vnode lock and block_task holds a
 *		file lock; removing that file lock at exit therefore
 *		requires block_task to grab the vnode lock first.
 */
static void
lf_hold_assertion(task_t block_task, struct lockf *block)
{
	if (task_importance_hold_file_lock_assertion(block_task, 1)) {
		block->lf_boosted = LF_BOOSTED;
		LOCKF_DEBUG(LF_DBG_IMPINH,
		    "lf: importance hold file lock assert on pid %d lock %p\n",
		    proc_pid(block->lf_owner), block);
	}
}


/*
 * lf_jump_to_queue_head
 *
 * Jump the lock from the tail of the block queue to the head of
 * the queue.
 *
 * Parameters:	block			lockf struct containing the
 *					block queue.
 *		lock			lockf struct to be jumped to the
 *					front.
 *
 * Returns:	<void>
 */
static void
lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
{
	/* Move the lock to the head of the block queue. */
	TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
	TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
}


/*
 * lf_drop_assertion
 *
 * Drops the task hold assertion.
 *
 * Parameters:	block			lockf struct holding the assertion.
 *
 * Returns:	<void>
 */
static void
lf_drop_assertion(struct lockf *block)
{
	LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
	    proc_pid(block->lf_owner), block);

	task_t current_task = proc_task(block->lf_owner);
	task_importance_drop_file_lock_assertion(current_task, 1);
	block->lf_boosted = LF_NOT_BOOSTED;
}

static void
lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
{
	task_t ltask = proc_task(lock->lf_owner);
	task_t btask = proc_task(block->lf_owner);

	/*
	 * Check if ltask can donate importance.  The
	 * check of imp_donor bit is done without holding
	 * any lock.  The value may change after you read it,
	 * but it is ok to boost a task while someone else is
	 * unboosting you.
	 *
	 * TODO: Support live inheritance on file locks.
	 */
	if (task_is_importance_donor(ltask)) {
		LOCKF_DEBUG(LF_DBG_IMPINH,
		    "lf: %d: attempt to boost pid %d that holds lock %p\n",
		    proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);

		if (block->lf_boosted != LF_BOOSTED &&
		    task_is_importance_receiver_type(btask)) {
			lf_hold_assertion(btask, block);
		}
		lf_jump_to_queue_head(block, lock);
	}
}
#endif /* IMPORTANCE_INHERITANCE */