git.saurik.com Git - apple/xnu.git/blobdiff - bsd/kern/kern_lockf.c
xnu-7195.50.7.100.1.tar.gz
[apple/xnu.git] / bsd / kern / kern_lockf.c
index 1ef3470cec9423d2c532851dc283299e614a5a4b..782346ed20ae8701523e1f071b5d1786b89ad240 100644 (file)
@@ -1,3 +1,30 @@
+/*
+ * Copyright (c) 2019-2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
  *     The Regents of the University of California.  All rights reserved.
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
+#include <sys/signalvar.h>
 #include <sys/unistd.h>
+#include <sys/user.h>
 #include <sys/vnode.h>
 #include <sys/vnode_internal.h>
 #include <sys/vnode_if.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/lockf.h>
+#include <sys/sdt.h>
+#include <kern/policy_internal.h>
 
-#if DEAD_CODE
-/*
- * This variable controls the maximum number of processes that will
- * be checked in doing deadlock detection.
- */
-static int maxlockdepth = MAXDEPTH;
-#endif /* DEAD_CODE */
+#include <sys/file_internal.h>
 
-#ifdef LOCKF_DEBUG
+#if (DEVELOPMENT || DEBUG)
+#define LOCKF_DEBUGGING 1
+#endif
+
+#ifdef LOCKF_DEBUGGING
 #include <sys/sysctl.h>
+void lf_print(const char *tag, struct lockf *lock);
+void lf_printlist(const char *tag, struct lockf *lock);
 
-#include <ufs/ufs/quota.h>
-#include <ufs/ufs/inode.h>
+#define LF_DBG_LOCKOP   (1 << 0)        /* setlk, getlk, clearlk */
+#define LF_DBG_LIST     (1 << 1)        /* split, coalesce */
+#define LF_DBG_IMPINH   (1 << 2)        /* importance inheritance */
+#define LF_DBG_TRACE    (1 << 3)        /* errors, exit */
+#define LF_DBG_DEADLOCK (1 << 4)        /* deadlock detection */
 
+static int      lockf_debug = 0;        /* was 2, could be 3 ;-) */
+SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");
 
-static int     lockf_debug = 2;
-SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
-#endif
+/*
+ * If the selector is set, then output the debugging diagnostic.
+ */
+#define LOCKF_DEBUG(mask, ...)                                  \
+       do {                                                    \
+               if ((mask) & lockf_debug) {        \
+                       printf("%s>", __FUNCTION__);            \
+                       printf(__VA_ARGS__);                    \
+               }                                               \
+       } while(0)
+
+#define LOCKF_DEBUGP(mask)                                      \
+       ({                                                      \
+               ((mask) & lockf_debug);                         \
+       })
+#else   /* !LOCKF_DEBUGGING */
+#define LOCKF_DEBUG(mask, ...)          /* mask */
+#endif  /* !LOCKF_DEBUGGING */
 
 MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
 
 #define NOLOCKF (struct lockf *)0
-#define SELF   0x1
-#define OTHERS 0x2
-#define OFF_MAX        0x7fffffffffffffffULL   /* max off_t */
-static int      lf_clearlock(struct lockf *);
-static int      lf_findoverlap(struct lockf *,
-           struct lockf *, int, struct lockf ***, struct lockf **);
-static struct lockf *
-        lf_getblock(struct lockf *);
-static int      lf_getlock(struct lockf *, struct flock *);
-static int      lf_setlock(struct lockf *);
-static void     lf_split(struct lockf *, struct lockf *);
-static void     lf_wakelock(struct lockf *);
+#define SELF    0x1
+#define OTHERS  0x2
+#define OFF_MAX 0x7fffffffffffffffULL   /* max off_t */
+
+/*
+ * Overlapping lock states
+ */
+typedef enum {
+       OVERLAP_NONE = 0,
+       OVERLAP_EQUALS_LOCK,
+       OVERLAP_CONTAINS_LOCK,
+       OVERLAP_CONTAINED_BY_LOCK,
+       OVERLAP_STARTS_BEFORE_LOCK,
+       OVERLAP_ENDS_AFTER_LOCK
+} overlap_t;
+
+static int       lf_clearlock(struct lockf *);
+static overlap_t lf_findoverlap(struct lockf *,
+    struct lockf *, int, struct lockf ***, struct lockf **);
+static struct lockf *lf_getblock(struct lockf *, pid_t);
+static int       lf_getlock(struct lockf *, struct flock *, pid_t);
+static int       lf_setlock(struct lockf *, struct timespec *);
+static int       lf_split(struct lockf *, struct lockf *);
+static void      lf_wakelock(struct lockf *, boolean_t);
+#if IMPORTANCE_INHERITANCE
+static void      lf_hold_assertion(task_t, struct lockf *);
+static void      lf_jump_to_queue_head(struct lockf *, struct lockf *);
+static void      lf_drop_assertion(struct lockf *);
+static void      lf_boost_blocking_proc(struct lockf *, struct lockf *);
+static void      lf_adjust_assertion(struct lockf *block);
+#endif /* IMPORTANCE_INHERITANCE */
+
+static lck_mtx_t lf_dead_lock;
+static lck_grp_t *lf_dead_lock_grp;
+
+void
+lf_init(void)
+{
+       lf_dead_lock_grp = lck_grp_alloc_init("lf_dead_lock", LCK_GRP_ATTR_NULL);
+       lck_mtx_init(&lf_dead_lock, lf_dead_lock_grp, LCK_ATTR_NULL);
+}
 
 /*
- * Advisory record locking support
+ * lf_advlock
+ *
+ * Description:        Advisory record locking support
+ *
+ * Parameters: ap                      Argument pointer to a vnop_advlock_args
+ *                                     argument descriptor structure for the
+ *                                     lock operation to be attempted.
+ *
+ * Returns:    0                       Success
+ *             EOVERFLOW
+ *             EINVAL
+ *             ENOLCK                  Number of locked regions exceeds limit
+ *     lf_setlock:EAGAIN
+ *     lf_setlock:EDEADLK
+ *     lf_setlock:EINTR
+ *     lf_setlock:ENOLCK
+ *     lf_setlock:ETIMEDOUT
+ *     lf_clearlock:ENOLCK
+ *     vnode_size:???
+ *
+ * Notes:      We return ENOLCK when we run out of memory to support locks; as
+ *             such, there is no specific expectation limit other than the
+ *             amount of available resources.
  */
 int
-lf_advlock(ap)
-       struct vnop_advlock_args /* {
-               struct vnode *a_vp;
-               caddr_t  a_id;
-               int  a_op;
-               struct flock *a_fl;
-               int  a_flags;
-               vfs_context_t a_context;
-       } */ *ap;
+lf_advlock(struct vnop_advlock_args *ap)
 {
        struct vnode *vp = ap->a_vp;
        struct flock *fl = ap->a_fl;
@@ -113,10 +207,10 @@ lf_advlock(ap)
        if (*head == (struct lockf *)0) {
                if (ap->a_op != F_SETLK) {
                        fl->l_type = F_UNLCK;
-#ifdef LOCKF_DEBUG
-               printf("lf_advlock: unlock without lock\n");
-#endif /* LOCKF_DEBUG */
-                       return (0);
+                       LOCKF_DEBUG(LF_DBG_TRACE,
+                           "lf_advlock: '%s' unlock without lock\n",
+                           vfs_context_proc(context)->p_comm);
+                       return 0;
                }
        }
 
@@ -124,7 +218,6 @@ lf_advlock(ap)
         * Convert the flock structure into a start and end.
         */
        switch (fl->l_whence) {
-
        case SEEK_SET:
        case SEEK_CUR:
                /*
@@ -136,67 +229,66 @@ lf_advlock(ap)
 
        case SEEK_END:
 
-               if ((error = vnode_size(vp, &size, context)))
-{
-#ifdef LOCKF_DEBUG
-               printf("lf_advlock: vnode_getattr failed: %d\n", error);
-#endif /* LOCKF_DEBUG */
-                       return (error);
-}
+               /*
+                * It's OK to cast the u_quad_t to an off_t here, since they
+                * are the same storage size, and the value of the returned
+                * contents will never overflow into the sign bit.  We need to
+                * do this because we will use size to force range checks.
+                */
+               if ((error = vnode_size(vp, (off_t *)&size, context))) {
+                       LOCKF_DEBUG(LF_DBG_TRACE,
+                           "lf_advlock: vnode_getattr failed: %d\n", error);
+                       return error;
+               }
 
                if (size > OFF_MAX ||
-                   (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
-                       return (EOVERFLOW);
+                   (fl->l_start > 0 &&
+                   size > (u_quad_t)(OFF_MAX - fl->l_start))) {
+                       return EOVERFLOW;
+               }
                start = size + fl->l_start;
                break;
 
        default:
-#ifdef LOCKF_DEBUG
-               printf("lf_advlock: unknown whence %d\n", fl->l_whence);
-#endif /* LOCKF_DEBUG */
-               return (EINVAL);
+               LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
+                   fl->l_whence);
+               return EINVAL;
+       }
+       if (start < 0) {
+               LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
+                   start);
+               return EINVAL;
        }
-       if (start < 0)
-{
-#ifdef LOCKF_DEBUG
-               printf("lf_advlock: start < 0 (%qd)\n", start);
-#endif /* LOCKF_DEBUG */
-               return (EINVAL);
-}
        if (fl->l_len < 0) {
-               if (start == 0)
-{
-#ifdef LOCKF_DEBUG
-               printf("lf_advlock: len < 0 & start == 0\n");
-#endif /* LOCKF_DEBUG */
-                       return (EINVAL);
-}
+               if (start == 0) {
+                       LOCKF_DEBUG(LF_DBG_TRACE,
+                           "lf_advlock: len < 0 & start == 0\n");
+                       return EINVAL;
+               }
                end = start - 1;
                start += fl->l_len;
-               if (start < 0)
-{
-#ifdef LOCKF_DEBUG
-               printf("lf_advlock: start < 0 (%qd)\n", start);
-#endif /* LOCKF_DEBUG */
-                       return (EINVAL);
-}
-       } else if (fl->l_len == 0)
+               if (start < 0) {
+                       LOCKF_DEBUG(LF_DBG_TRACE,
+                           "lf_advlock: start < 0 (%qd)\n", start);
+                       return EINVAL;
+               }
+       } else if (fl->l_len == 0) {
                end = -1;
-       else {
+       } else {
                oadd = fl->l_len - 1;
-               if (oadd > (off_t)(OFF_MAX - start))
-{
-#ifdef LOCKF_DEBUG
-               printf("lf_advlock: overflow\n");
-#endif /* LOCKF_DEBUG */
-                       return (EOVERFLOW);
-}
+               if (oadd > (off_t)(OFF_MAX - start)) {
+                       LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
+                       return EOVERFLOW;
+               }
                end = start + oadd;
        }
        /*
         * Create the lockf structure
         */
        MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
+       if (lock == NULL) {
+               return ENOLCK;
+       }
        lock->lf_start = start;
        lock->lf_end = end;
        lock->lf_id = ap->a_id;
@@ -205,15 +297,41 @@ lf_advlock(ap)
        lock->lf_head = head;
        lock->lf_next = (struct lockf *)0;
        TAILQ_INIT(&lock->lf_blkhd);
-       lock->lf_flags = ap->a_flags;
+       lock->lf_flags = (short)ap->a_flags;
+#if IMPORTANCE_INHERITANCE
+       lock->lf_boosted = LF_NOT_BOOSTED;
+#endif
+       if (ap->a_flags & F_POSIX) {
+               lock->lf_owner = (struct proc *)lock->lf_id;
+       } else {
+               lock->lf_owner = NULL;
+       }
+
+       if (ap->a_flags & F_FLOCK) {
+               lock->lf_flags |= F_WAKE1_SAFE;
+       }
 
-       lck_mtx_lock(&vp->v_lock);      /* protect the lockf list */
+       lck_mtx_lock(&vp->v_lock);      /* protect the lockf list */
        /*
         * Do the requested operation.
         */
-       switch(ap->a_op) {
+       switch (ap->a_op) {
        case F_SETLK:
-               error = lf_setlock(lock);
+               /*
+                * For F_OFD_* locks, lf_id is the fileglob.
+                * Record an "lf_owner" iff this is a confined fd
+                * i.e. it cannot escape this process and will be
+                * F_UNLCKed before the owner exits.  (This is
+                * the implicit guarantee needed to ensure lf_owner
+                * remains a valid reference here.)
+                */
+               if (ap->a_flags & F_OFD_LOCK) {
+                       struct fileglob *fg = (void *)lock->lf_id;
+                       if (fg->fg_lflags & FG_CONFINED) {
+                               lock->lf_owner = current_proc();
+                       }
+               }
+               error = lf_setlock(lock, ap->a_timeout);
                break;
 
        case F_UNLCK:
@@ -222,7 +340,12 @@ lf_advlock(ap)
                break;
 
        case F_GETLK:
-               error = lf_getlock(lock, fl);
+               error = lf_getlock(lock, fl, -1);
+               FREE(lock, M_LOCKF);
+               break;
+
+       case F_GETLKPID:
+               error = lf_getlock(lock, fl, fl->l_pid);
                FREE(lock, M_LOCKF);
                break;
 
@@ -231,96 +354,338 @@ lf_advlock(ap)
                error = EINVAL;
                break;
        }
-       lck_mtx_unlock(&vp->v_lock);    /* done maniplulating the list */
+       lck_mtx_unlock(&vp->v_lock);    /* done manipulating the list */
 
-#ifdef LOCKF_DEBUG
-       printf("lf_advlock: normal exit: %d\n", error);
-#endif /* LOCKF_DEBUG */
-       return (error);
+       LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
+       return error;
 }
 
 /*
- * Set a byte-range lock.
+ * Empty the queue of msleeping requests for a lock on the given vnode.
+ * Called with the vnode already locked.  Used for forced unmount, where
+ * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
+ * that prevents the vnode from ever being drained.  Force unmounting wins.
  */
-static int
-lf_setlock(lock)
+void
+lf_abort_advlocks(vnode_t vp)
+{
        struct lockf *lock;
+
+       if ((lock = vp->v_lockf) == NULL) {
+               return;
+       }
+
+       lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
+
+       if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
+               struct lockf *tlock;
+
+               TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
+                       /*
+                        * Setting this flag should cause all
+                        * currently blocked F_SETLK requests to
+                        * return to userland with an errno.
+                        */
+                       tlock->lf_flags |= F_ABORT;
+               }
+               lf_wakelock(lock, TRUE);
+       }
+}
+
+/*
+ * Take any lock attempts which are currently blocked by a given lock ("from")
+ * and mark them as blocked by a different lock ("to").  Used in the case
+ * where a byte range currently occupied by "from" is to be occupied by "to."
+ */
+static void
+lf_move_blocked(struct lockf *to, struct lockf *from)
+{
+       struct lockf *tlock;
+
+       TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
+               tlock->lf_next = to;
+       }
+
+       TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
+}
+
+/*
+ * lf_coalesce_adjacent
+ *
+ * Description:        Helper function: when setting a lock, coalesce adjacent
+ *             locks.  Needed because adjacent locks are not overlapping,
+ *             but POSIX requires that they be coalesced.
+ *
+ * Parameters: lock                    The new lock which may be adjacent
+ *                                     to already locked regions, and which
+ *                                     should therefore be coalesced with them
+ *
+ * Returns:    <void>
+ */
+static void
+lf_coalesce_adjacent(struct lockf *lock)
+{
+       struct lockf **lf = lock->lf_head;
+
+       while (*lf != NOLOCKF) {
+               /* reject locks that obviously could not be coalesced */
+               if ((*lf == lock) ||
+                   ((*lf)->lf_id != lock->lf_id) ||
+                   ((*lf)->lf_type != lock->lf_type)) {
+                       lf = &(*lf)->lf_next;
+                       continue;
+               }
+
+               /*
+                * NOTE: Assumes that if two locks are adjacent on the number line
+                * and belong to the same owner, then they are adjacent on the list.
+                */
+               if ((*lf)->lf_end != -1 &&
+                   ((*lf)->lf_end + 1) == lock->lf_start) {
+                       struct lockf *adjacent = *lf;
+
+                       LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
+                       lock->lf_start = (*lf)->lf_start;
+                       *lf = lock;
+                       lf = &(*lf)->lf_next;
+
+                       lf_move_blocked(lock, adjacent);
+
+                       FREE(adjacent, M_LOCKF);
+                       continue;
+               }
+               /* If the lock starts adjacent to us, we can coalesce it */
+               if (lock->lf_end != -1 &&
+                   (lock->lf_end + 1) == (*lf)->lf_start) {
+                       struct lockf *adjacent = *lf;
+
+                       LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
+                       lock->lf_end = (*lf)->lf_end;
+                       lock->lf_next = (*lf)->lf_next;
+                       lf = &lock->lf_next;
+
+                       lf_move_blocked(lock, adjacent);
+
+                       FREE(adjacent, M_LOCKF);
+                       continue;
+               }
+
+               /* no matching conditions; go on to next lock */
+               lf = &(*lf)->lf_next;
+       }
+}
+
+/*
+ * lf_setlock
+ *
+ * Description:        Set a byte-range lock.
+ *
+ * Parameters: lock                    The lock structure describing the lock
+ *                                     to be set; allocated by the caller, it
+ *                                     will be linked into the lock list if
+ *                                     the set is successful, and freed if the
+ *                                     set is unsuccessful.
+ *
+ *             timeout                 Timeout specified in the case of
+ *                                      SETLKWTIMEOUT.
+ *
+ * Returns:    0                       Success
+ *             EAGAIN
+ *             EDEADLK
+ *     lf_split:ENOLCK
+ *     lf_clearlock:ENOLCK
+ *     msleep:EINTR
+ *     msleep:ETIMEDOUT
+ *
+ * Notes:      We add the lock to the provisional lock list.  We do not
+ *             coalesce at this time; this has implications for other lock
+ *             requestors in the blocker search mechanism.
+ */
+static int
+lf_setlock(struct lockf *lock, struct timespec *timeout)
 {
        struct lockf *block;
        struct lockf **head = lock->lf_head;
        struct lockf **prev, *overlap, *ltmp;
-       static char lockstr[] = "lockf";
-       int ovcase, priority, needtolink, error;
+       static const char lockstr[] = "lockf";
+       int priority, needtolink, error;
        struct vnode *vp = lock->lf_vnode;
+       overlap_t ovcase;
 
-#ifdef LOCKF_DEBUG
-       if (lockf_debug & 1)
+#ifdef LOCKF_DEBUGGING
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_print("lf_setlock", lock);
-#endif /* LOCKF_DEBUG */
+               lf_printlist("lf_setlock(in)", lock);
+       }
+#endif /* LOCKF_DEBUGGING */
+       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n", lock, lock->lf_vnode);
 
        /*
         * Set the priority
         */
        priority = PLOCK;
-       if (lock->lf_type == F_WRLCK)
+       if (lock->lf_type == F_WRLCK) {
                priority += 4;
+       }
        priority |= PCATCH;
+scan:
        /*
         * Scan lock list for this file looking for locks that would block us.
         */
-       while ((block = lf_getblock(lock))) {
+       while ((block = lf_getblock(lock, -1))) {
                /*
                 * Free the structure and return if nonblocking.
                 */
                if ((lock->lf_flags & F_WAIT) == 0) {
+                       DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
                        FREE(lock, M_LOCKF);
-                       return (EAGAIN);
+                       return EAGAIN;
                }
-#if DEAD_CODE
-/*
- * XXX This is dead code on MacOS X; it shouldn't be.
- */
+
+               LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n", lock, block);
+
                /*
                 * We are blocked. Since flock style locks cover
                 * the whole file, there is no chance for deadlock.
-                * For byte-range locks we must check for deadlock.
+                *
+                * OFD byte-range locks currently do NOT support
+                * deadlock detection.
+                *
+                * For POSIX byte-range locks we must check for deadlock.
                 *
                 * Deadlock detection is done by looking through the
                 * wait channels to see if there are any cycles that
-                * involve us. MAXDEPTH is set just to make sure we
-                * do not go off into neverland.
+                * involve us.
                 */
                if ((lock->lf_flags & F_POSIX) &&
                    (block->lf_flags & F_POSIX)) {
-                       struct proc *wproc;
-                       struct thread *td;
-                       struct lockf *waitblock;
-                       int i = 0;
-
-                       /* The block is waiting on something */
-                       /* XXXKSE this is not complete under threads */
-                       wproc = (struct proc *)block->lf_id;
-                       mtx_lock_spin(&sched_lock);
-                       FOREACH_THREAD_IN_PROC(wproc, td) {
-                               while (td->td_wchan &&
-                                   (td->td_wmesg == lockstr) &&
-                                   (i++ < maxlockdepth)) {
-                                       waitblock = (struct lockf *)td->td_wchan;
-                                       /* Get the owner of the blocking lock */
-                                       waitblock = waitblock->lf_next;
-                                       if ((waitblock->lf_flags & F_POSIX) == 0)
-                                               break;
-                                       wproc = (struct proc *)waitblock->lf_id;
-                                       if (wproc == (struct proc *)lock->lf_id) {
-                                               mtx_unlock_spin(&sched_lock);
+                       lck_mtx_lock(&lf_dead_lock);
+
+                       /* The blocked process is waiting on something */
+                       struct proc *wproc = block->lf_owner;
+                       proc_lock(wproc);
+
+                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(wproc));
+
+                       struct uthread *ut;
+                       TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
+                               /*
+                                * If the thread is (a) asleep (uu_wchan != 0)
+                                * and (b) in this code (uu_wmesg == lockstr)
+                                * then check to see if the lock is blocked behind
+                                * someone blocked behind us.
+                                *
+                                * Note: (i) vp->v_lock is held, preventing other
+                                * threads from mutating the blocking list for our vnode.
+                                * and (ii) the proc_lock is held i.e the thread list
+                                * is stable.
+                                *
+                                * HOWEVER some thread in wproc might be sleeping on a lockf
+                                * structure for a different vnode, and be woken at any
+                                * time. Thus the waitblock list could mutate while
+                                * it's being inspected by this thread, and what
+                                * ut->uu_wchan was just pointing at could even be freed.
+                                *
+                                * Nevertheless this is safe here because of lf_dead_lock; if
+                                * any thread blocked with uu_wmesg == lockstr wakes (see below)
+                                * it will try to acquire lf_dead_lock which is already held
+                                * here. Holding that lock prevents the lockf structure being
+                                * pointed at by ut->uu_wchan from going away. Thus the vnode
+                                * involved can be found and locked, and the corresponding
+                                * blocking chain can then be examined safely.
+                                */
+                               const struct lockf *waitblock = (const void *)ut->uu_wchan;
+                               if ((waitblock != NULL) && (ut->uu_wmesg == lockstr)) {
+                                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);
+
+                                       vnode_t othervp = NULL;
+                                       if (waitblock->lf_vnode != vp) {
+                                               /*
+                                                * This thread in wproc is waiting for a lock
+                                                * on a different vnode; grab the lock on it
+                                                * that protects lf_next while we examine it.
+                                                */
+                                               othervp = waitblock->lf_vnode;
+                                               if (!lck_mtx_try_lock(&othervp->v_lock)) {
+                                                       /*
+                                                        * avoid kernel deadlock: drop all
+                                                        * locks, pause for a bit to let the
+                                                        * other thread do what it needs to do,
+                                                        * then (because we drop and retake
+                                                        * v_lock) retry the scan.
+                                                        */
+                                                       proc_unlock(wproc);
+                                                       lck_mtx_unlock(&lf_dead_lock);
+                                                       static struct timespec ts = {
+                                                               .tv_sec = 0,
+                                                               .tv_nsec = 2 * NSEC_PER_MSEC,
+                                                       };
+                                                       static const char pausestr[] = "lockf:pause";
+                                                       (void) msleep(lock, &vp->v_lock, priority, pausestr, &ts);
+                                                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n", lock, othervp);
+                                                       goto scan;
+                                               }
+                                       }
+
+                                       /*
+                                        * Get the lock blocking the lock
+                                        * which would block us, and make
+                                        * certain it hasn't become unblocked
+                                        * (been granted, e.g. between the time
+                                        * we called lf_getblock, and the time
+                                        * we successfully acquired the
+                                        * proc_lock).
+                                        */
+                                       const struct lockf *nextblock = waitblock->lf_next;
+                                       if (nextblock == NULL) {
+                                               if (othervp) {
+                                                       lck_mtx_unlock(&othervp->v_lock);
+                                               }
+                                               LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with waitblock %p and no lf_next; othervp %p\n", lock, waitblock, othervp);
+                                               continue;
+                                       }
+                                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, nextblock, nextblock->lf_vnode);
+
+                                       /*
+                                        * Make sure it's an advisory range
+                                        * lock and not any other kind of lock;
+                                        * if we mix lock types, it's our own
+                                        * fault.
+                                        */
+                                       if ((nextblock->lf_flags & F_POSIX) == 0) {
+                                               if (othervp) {
+                                                       lck_mtx_unlock(&othervp->v_lock);
+                                               }
+                                               continue;
+                                       }
+
+                                       /*
+                                        * If the owner of the lock that's
+                                        * blocking a lock that's blocking us
+                                        * getting the requested lock, then we
+                                        * would deadlock, so error out.
+                                        */
+                                       struct proc *bproc = nextblock->lf_owner;
+                                       const boolean_t deadlocked = bproc == lock->lf_owner;
+
+                                       if (othervp) {
+                                               lck_mtx_unlock(&othervp->v_lock);
+                                       }
+                                       LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(bproc));
+                                       if (deadlocked) {
+                                               LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock);
+                                               proc_unlock(wproc);
+                                               lck_mtx_unlock(&lf_dead_lock);
                                                FREE(lock, M_LOCKF);
-                                               return (EDEADLK);
+                                               return EDEADLK;
                                        }
                                }
+                               LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n", lock);
                        }
-                       mtx_unlock_spin(&sched_lock);
+                       proc_unlock(wproc);
+                       lck_mtx_unlock(&lf_dead_lock);
                }
-#endif /* DEAD_CODE */
+
                /*
                 * For flock type locks, we must first remove
                 * any shared locks that we hold before we sleep
@@ -329,7 +694,10 @@ lf_setlock(lock)
                if ((lock->lf_flags & F_FLOCK) &&
                    lock->lf_type == F_WRLCK) {
                        lock->lf_type = F_UNLCK;
-                       (void) lf_clearlock(lock);
+                       if ((error = lf_clearlock(lock)) != 0) {
+                               FREE(lock, M_LOCKF);
+                               return error;
+                       }
                        lock->lf_type = F_WRLCK;
                }
                /*
@@ -338,30 +706,108 @@ lf_setlock(lock)
                 */
                lock->lf_next = block;
                TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
-#ifdef LOCKF_DEBUG
-               if (lockf_debug & 1) {
+
+               if (!(lock->lf_flags & F_FLOCK)) {
+                       block->lf_flags &= ~F_WAKE1_SAFE;
+               }
+
+#if IMPORTANCE_INHERITANCE
+               /*
+                * Importance donation is done only for cases where the
+                * owning task can be unambiguously determined.
+                *
+                * POSIX type locks are not inherited by child processes;
+                * we maintain a 1:1 mapping between a lock and its owning
+                * process.
+                *
+                * Flock type locks are inherited across fork() and there is
+                * no 1:1 mapping in the general case.  However, the fileglobs
+                * used by OFD locks *may* be confined to the process that
+                * created them, and thus have an "owner", in which case
+                * we also attempt importance donation.
+                */
+               if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0) {
+                       lf_boost_blocking_proc(lock, block);
+               } else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
+                   lock->lf_owner != block->lf_owner &&
+                   NULL != lock->lf_owner && NULL != block->lf_owner) {
+                       lf_boost_blocking_proc(lock, block);
+               }
+#endif /* IMPORTANCE_INHERITANCE */
+
+#ifdef LOCKF_DEBUGGING
+               if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                        lf_print("lf_setlock: blocking on", block);
-                       lf_printlist("lf_setlock", block);
+                       lf_printlist("lf_setlock(block)", block);
                }
-#endif /* LOCKF_DEBUG */
-               error = msleep(lock, &vp->v_lock, priority, lockstr, 0);
-               if (error) {    /* XXX */
+#endif /* LOCKF_DEBUGGING */
+               DTRACE_FSINFO(advlock__wait, vnode_t, vp);
+
+               if (lock->lf_flags & F_POSIX) {
+                       error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
+                       /*
+                        * Ensure that 'lock' doesn't get mutated or freed if a
+                        * wakeup occurs while hunting for deadlocks (and holding
+                        * lf_dead_lock - see above)
+                        */
+                       lck_mtx_lock(&lf_dead_lock);
+                       lck_mtx_unlock(&lf_dead_lock);
+               } else {
+                       static const char lockstr_np[] = "lockf:np";
+                       error = msleep(lock, &vp->v_lock, priority, lockstr_np, timeout);
+               }
+
+               if (error == 0 && (lock->lf_flags & F_ABORT) != 0) {
+                       error = EBADF;
+               }
+
+               if (lock->lf_next) {
+                       /*
+                        * lf_wakelock() always sets wakelock->lf_next to
+                        * NULL before a wakeup; so we've been woken early
+                        * - perhaps by a debugger, signal or other event.
+                        *
+                        * Remove 'lock' from the block list (avoids double-add
+                        * in the spurious case, which would create a cycle)
+                        */
+                       TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
+#if IMPORTANCE_INHERITANCE
                        /*
-                        * We may have been awakened by a signal and/or by a
-                        * debugger continuing us (in which cases we must remove
-                        * ourselves from the blocked list) and/or by another
-                        * process releasing a lock (in which case we have
-                        * already been removed from the blocked list and our
-                        * lf_next field set to NOLOCKF).
+                        * Adjust the boost on lf_next.
                         */
-                       if (lock->lf_next) {
-                               TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
-                               lock->lf_next = NOLOCKF;
+                       lf_adjust_assertion(lock->lf_next);
+#endif /* IMPORTANCE_INHERITANCE */
+                       lock->lf_next = NULL;
+
+                       if (error == 0) {
+                               /*
+                                * If this was a spurious wakeup, retry
+                                */
+                               printf("%s: spurious wakeup, retrying lock\n",
+                                   __func__);
+                               continue;
+                       }
+               }
+
+               if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
+                       if ((block = lf_getblock(lock, -1)) != NULL) {
+                               lf_move_blocked(block, lock);
+                       }
+               }
+
+               if (error) {
+                       if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
+                               lf_wakelock(lock, TRUE);
                        }
                        FREE(lock, M_LOCKF);
-                       return (error);
-               }       /* XXX */
+                       /* Return ETIMEDOUT if timeout occurred. */
+                       if (error == EWOULDBLOCK) {
+                               error = ETIMEDOUT;
+                       }
+                       return error;
+               }
        }
+
        /*
         * No blocks!!  Add the lock.  Note that we will
         * downgrade or upgrade any overlapping locks this
@@ -375,8 +821,9 @@ lf_setlock(lock)
        needtolink = 1;
        for (;;) {
                ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
-               if (ovcase)
+               if (ovcase) {
                        block = overlap->lf_next;
+               }
                /*
                 * Six cases:
                 *      0) no overlap
@@ -387,52 +834,62 @@ lf_setlock(lock)
                 *      5) overlap ends after lock
                 */
                switch (ovcase) {
-               case 0: /* no overlap */
+               case OVERLAP_NONE:
                        if (needtolink) {
                                *prev = lock;
                                lock->lf_next = overlap;
                        }
                        break;
 
-               case 1: /* overlap == lock */
+               case OVERLAP_EQUALS_LOCK:
                        /*
                         * If downgrading lock, others may be
                         * able to acquire it.
                         */
                        if (lock->lf_type == F_RDLCK &&
-                           overlap->lf_type == F_WRLCK)
-                               lf_wakelock(overlap);
+                           overlap->lf_type == F_WRLCK) {
+                               lf_wakelock(overlap, TRUE);
+                       }
                        overlap->lf_type = lock->lf_type;
                        FREE(lock, M_LOCKF);
-                       lock = overlap; /* for debug output below */
+                       lock = overlap; /* for lf_coalesce_adjacent() */
                        break;
 
-               case 2: /* overlap contains lock */
+               case OVERLAP_CONTAINS_LOCK:
                        /*
                         * Check for common starting point and different types.
                         */
                        if (overlap->lf_type == lock->lf_type) {
                                FREE(lock, M_LOCKF);
-                               lock = overlap; /* for debug output below */
+                               lock = overlap; /* for lf_coalesce_adjacent() */
                                break;
                        }
                        if (overlap->lf_start == lock->lf_start) {
                                *prev = lock;
                                lock->lf_next = overlap;
                                overlap->lf_start = lock->lf_end + 1;
-                       } else
-                               lf_split(overlap, lock);
-                       lf_wakelock(overlap);
+                       } else {
+                               /*
+                                * If we can't split the lock, we can't
+                                * grant it.  Claim a system limit for the
+                                * resource shortage.
+                                */
+                               if (lf_split(overlap, lock)) {
+                                       FREE(lock, M_LOCKF);
+                                       return ENOLCK;
+                               }
+                       }
+                       lf_wakelock(overlap, TRUE);
                        break;
 
-               case 3: /* lock contains overlap */
+               case OVERLAP_CONTAINED_BY_LOCK:
                        /*
                         * If downgrading lock, others may be able to
                         * acquire it, otherwise take the list.
                         */
                        if (lock->lf_type == F_RDLCK &&
                            overlap->lf_type == F_WRLCK) {
-                               lf_wakelock(overlap);
+                               lf_wakelock(overlap, TRUE);
                        } else {
                                while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
                                        ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
@@ -451,12 +908,13 @@ lf_setlock(lock)
                                lock->lf_next = overlap->lf_next;
                                prev = &lock->lf_next;
                                needtolink = 0;
-                       } else
+                       } else {
                                *prev = overlap->lf_next;
+                       }
                        FREE(overlap, M_LOCKF);
                        continue;
 
-               case 4: /* overlap starts before lock */
+               case OVERLAP_STARTS_BEFORE_LOCK:
                        /*
                         * Add lock after overlap on the list.
                         */
@@ -464,11 +922,11 @@ lf_setlock(lock)
                        overlap->lf_next = lock;
                        overlap->lf_end = lock->lf_start - 1;
                        prev = &lock->lf_next;
-                       lf_wakelock(overlap);
+                       lf_wakelock(overlap, TRUE);
                        needtolink = 0;
                        continue;
 
-               case 5: /* overlap ends after lock */
+               case OVERLAP_ENDS_AFTER_LOCK:
                        /*
                         * Add the new lock before overlap.
                         */
@@ -477,301 +935,426 @@ lf_setlock(lock)
                                lock->lf_next = overlap;
                        }
                        overlap->lf_start = lock->lf_end + 1;
-                       lf_wakelock(overlap);
+                       lf_wakelock(overlap, TRUE);
                        break;
                }
                break;
        }
-#ifdef LOCKF_DEBUG
-       if (lockf_debug & 1) {
+       /* Coalesce adjacent locks with identical attributes */
+       lf_coalesce_adjacent(lock);
+#ifdef LOCKF_DEBUGGING
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_print("lf_setlock: got the lock", lock);
-               lf_printlist("lf_setlock", lock);
+               lf_printlist("lf_setlock(out)", lock);
        }
-#endif /* LOCKF_DEBUG */
-       return (0);
+#endif /* LOCKF_DEBUGGING */
+       return 0;
 }
 
+
 /*
- * Remove a byte-range lock on an inode.
+ * lf_clearlock
+ *
+ * Description:        Remove a byte-range lock on a vnode.  Generally, find the
+ *             lock (or an overlap to that lock) and remove it (or shrink
+ *             it), then wakeup anyone we can.
+ *
+ * Parameters: unlock                  The lock to clear
+ *
+ * Returns:    0                       Success
+ *     lf_split:ENOLCK
  *
- * Generally, find the lock (or an overlap to that lock)
- * and remove it (or shrink it), then wakeup anyone we can.
+ * Notes:      A caller may unlock all the locks owned by the caller by
+ *             specifying the entire file range; locks owned by other
+ *             callers are not affected by this operation.
  */
 static int
-lf_clearlock(unlock)
-       struct lockf *unlock;
+lf_clearlock(struct lockf *unlock)
 {
        struct lockf **head = unlock->lf_head;
        struct lockf *lf = *head;
        struct lockf *overlap, **prev;
-       int ovcase;
+       overlap_t ovcase;
 
-       if (lf == NOLOCKF)
-               return (0);
-#ifdef LOCKF_DEBUG
-       if (unlock->lf_type != F_UNLCK)
+       if (lf == NOLOCKF) {
+               return 0;
+       }
+#ifdef LOCKF_DEBUGGING
+       if (unlock->lf_type != F_UNLCK) {
                panic("lf_clearlock: bad type");
-       if (lockf_debug & 1)
+       }
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_print("lf_clearlock", unlock);
-#endif /* LOCKF_DEBUG */
+       }
+#endif /* LOCKF_DEBUGGING */
        prev = head;
-       while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) {
+       while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
                /*
                 * Wakeup the list of locks to be retried.
                 */
-               lf_wakelock(overlap);
+               lf_wakelock(overlap, FALSE);
+#if IMPORTANCE_INHERITANCE
+               if (overlap->lf_boosted == LF_BOOSTED) {
+                       lf_drop_assertion(overlap);
+               }
+#endif /* IMPORTANCE_INHERITANCE */
 
                switch (ovcase) {
+               case OVERLAP_NONE:      /* satisfy compiler enum/switch */
+                       break;
 
-               case 1: /* overlap == lock */
+               case OVERLAP_EQUALS_LOCK:
                        *prev = overlap->lf_next;
                        FREE(overlap, M_LOCKF);
                        break;
 
-               case 2: /* overlap contains lock: split it */
+               case OVERLAP_CONTAINS_LOCK: /* split it */
                        if (overlap->lf_start == unlock->lf_start) {
                                overlap->lf_start = unlock->lf_end + 1;
                                break;
                        }
-                       lf_split(overlap, unlock);
+                       /*
+                        * If we can't split the lock, we can't complete the unlock.
+                        * Claim a system limit for the resource shortage.
+                        */
+                       if (lf_split(overlap, unlock)) {
+                               return ENOLCK;
+                       }
                        overlap->lf_next = unlock->lf_next;
                        break;
 
-               case 3: /* lock contains overlap */
+               case OVERLAP_CONTAINED_BY_LOCK:
                        *prev = overlap->lf_next;
                        lf = overlap->lf_next;
                        FREE(overlap, M_LOCKF);
                        continue;
 
-               case 4: /* overlap starts before lock */
+               case OVERLAP_STARTS_BEFORE_LOCK:
                        overlap->lf_end = unlock->lf_start - 1;
                        prev = &overlap->lf_next;
                        lf = overlap->lf_next;
                        continue;
 
-               case 5: /* overlap ends after lock */
+               case OVERLAP_ENDS_AFTER_LOCK:
                        overlap->lf_start = unlock->lf_end + 1;
                        break;
                }
                break;
        }
-#ifdef LOCKF_DEBUG
-       if (lockf_debug & 1)
+#ifdef LOCKF_DEBUGGING
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_printlist("lf_clearlock", unlock);
-#endif /* LOCKF_DEBUG */
-       return (0);
+       }
+#endif /* LOCKF_DEBUGGING */
+       return 0;
 }
 
+
 /*
- * Check whether there is a blocking lock,
- * and if so return its process identifier.
+ * lf_getlock
+ *
+ * Description:        Check whether there is a blocking lock, and if so return
+ *             its type, range and owner pid in the supplied flock structure.
+ *
+ * Parameters: lock                    Pointer to lock to test for blocks
+ *             fl                      Pointer to flock structure to receive
+ *                                     the blocking lock information, if a
+ *                                     blocking lock is found.
+ *             matchpid                -1, or pid value to match in lookup.
+ *
+ * Returns:    0                       Success
+ *
+ * Implicit Returns:
+ *             *fl                     Contents modified to reflect the
+ *                                     blocking lock, if one is found; not
+ *                                     modified otherwise
+ *
+ * Notes:      fl->l_pid will be (-1) for file locks and will only be set to
+ *             the blocking process ID for advisory record locks.
  */
 static int
-lf_getlock(lock, fl)
-       struct lockf *lock;
-       struct flock *fl;
+lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
 {
        struct lockf *block;
 
-#ifdef LOCKF_DEBUG
-       if (lockf_debug & 1)
+#ifdef LOCKF_DEBUGGING
+       if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                lf_print("lf_getlock", lock);
-#endif /* LOCKF_DEBUG */
+       }
+#endif /* LOCKF_DEBUGGING */
 
-       if ((block = lf_getblock(lock))) {
+       if ((block = lf_getblock(lock, matchpid))) {
                fl->l_type = block->lf_type;
                fl->l_whence = SEEK_SET;
                fl->l_start = block->lf_start;
-               if (block->lf_end == -1)
+               if (block->lf_end == -1) {
                        fl->l_len = 0;
-               else
+               } else {
                        fl->l_len = block->lf_end - block->lf_start + 1;
-               if (block->lf_flags & F_POSIX)
-                       fl->l_pid = proc_pid((struct proc *)(block->lf_id));
-               else
+               }
+               if (NULL != block->lf_owner) {
+                       /*
+                        * lf_owner is only non-NULL when the lock
+                        * "owner" can be unambiguously determined
+                        */
+                       fl->l_pid = proc_pid(block->lf_owner);
+               } else {
                        fl->l_pid = -1;
+               }
        } else {
                fl->l_type = F_UNLCK;
        }
-       return (0);
+       return 0;
 }
 
 /*
- * Walk the list of locks for an inode and
- * return the first blocking lock.
+ * lf_getblock
+ *
+ * Description:        Walk the list of locks for an inode and return the first
+ *             blocking lock.  A lock is considered blocking if we are not
+ *             the lock owner; otherwise, we are permitted to upgrade or
+ *             downgrade it, and it's not considered blocking.
+ *
+ * Parameters: lock                    The lock for which we are interested
+ *                                     in obtaining the blocking lock, if any
+ *             matchpid                -1, or pid value to match in lookup.
+ *
+ * Returns:    NOLOCKF                 No blocking lock exists
+ *             !NOLOCKF                The address of the blocking lock's
+ *                                     struct lockf.
  */
 static struct lockf *
-lf_getblock(lock)
-       struct lockf *lock;
+lf_getblock(struct lockf *lock, pid_t matchpid)
 {
        struct lockf **prev, *overlap, *lf = *(lock->lf_head);
-       int ovcase;
 
-       prev = lock->lf_head;
-       while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) {
+       for (prev = lock->lf_head;
+           lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
+           lf = overlap->lf_next) {
                /*
-                * We've found an overlap, see if it blocks us
+                * Found an overlap.
+                *
+                * If we're matching pids, and it's a record lock,
+                * or it's an OFD lock on a process-confined fd,
+                * but the pid doesn't match, then keep on looking ..
                 */
-               if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
-                       return (overlap);
+               if (matchpid != -1 &&
+                   (overlap->lf_flags & (F_POSIX | F_OFD_LOCK)) != 0 &&
+                   proc_pid(overlap->lf_owner) != matchpid) {
+                       continue;
+               }
+
                /*
-                * Nope, point to the next one on the list and
-                * see if it blocks us
+                * does it block us?
                 */
-               lf = overlap->lf_next;
+               if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) {
+                       return overlap;
+               }
        }
-       return (NOLOCKF);
+       return NOLOCKF;
 }
 
+
 /*
- * Walk the list of locks to
- * find an overlapping lock (if any).
+ * lf_findoverlap
+ *
+ * Description:        Walk the list of locks to find an overlapping lock (if any).
+ *
+ * Parameters: lf                      First lock on lock list
+ *             lock                    The lock we are checking for an overlap
+ *             check                   Check type
+ *             prev                    pointer to pointer to pointer; receives
+ *                                     the address of the next pointer in the
+ *                                     lock previous to the overlap, if any
+ *             overlap                 pointer to pointer to contain address
+ *                                     of overlapping lock
+ *
+ * Returns:    OVERLAP_NONE
+ *             OVERLAP_EQUALS_LOCK
+ *             OVERLAP_CONTAINS_LOCK
+ *             OVERLAP_CONTAINED_BY_LOCK
+ *             OVERLAP_STARTS_BEFORE_LOCK
+ *             OVERLAP_ENDS_AFTER_LOCK
+ *
+ * Implicit Returns:
+ *             *prev                   The address of the next pointer in the
+ *                                     lock previous to the overlapping lock;
+ *                                     this is generally used to relink the
+ *                                     lock list, avoiding a second iteration.
+ *             *overlap                The pointer to the overlapping lock
+ *                                     itself; this is used to return data in
+ *                                     the check == OTHERS case, and for the
+ *                                     caller to modify the overlapping lock,
+ *                                     in the check == SELF case
  *
- * NOTE: this returns only the FIRST overlapping lock.  There
- *      may be more than one.
+ * Note:       This returns only the FIRST overlapping lock.  There may be
+ *             more than one.  lf_getlock will return the first blocking lock,
+ *             while lf_setlock will iterate over all overlapping locks.
+ *
+ *             The check parameter can be SELF, meaning we are looking for
+ *             overlapping locks owned by us, or it can be OTHERS, meaning
+ *             we are looking for overlapping locks owned by someone else so
+ *             we can report a blocking lock on an F_GETLK request.
+ *
+ *             The value of *overlap and *prev are modified, even if there is
+ *             no overlapping lock found; always check the return code.
  */
-static int
-lf_findoverlap(lf, lock, type, prev, overlap)
-       struct lockf *lf;
-       struct lockf *lock;
-       int type;
-       struct lockf ***prev;
-       struct lockf **overlap;
+static overlap_t
+lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
+    struct lockf ***prev, struct lockf **overlap)
 {
        off_t start, end;
+       int found_self = 0;
 
        *overlap = lf;
-       if (lf == NOLOCKF)
-               return (0);
-#ifdef LOCKF_DEBUG
-       if (lockf_debug & 2)
+       if (lf == NOLOCKF) {
+               return 0;
+       }
+#ifdef LOCKF_DEBUGGING
+       if (LOCKF_DEBUGP(LF_DBG_LIST)) {
                lf_print("lf_findoverlap: looking for overlap in", lock);
-#endif /* LOCKF_DEBUG */
+       }
+#endif /* LOCKF_DEBUGGING */
        start = lock->lf_start;
        end = lock->lf_end;
        while (lf != NOLOCKF) {
                if (((type & SELF) && lf->lf_id != lock->lf_id) ||
                    ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
+                       /*
+                        * Locks belonging to one process are adjacent on the
+                        * list, so if we've found any locks belonging to us,
+                        * and we're now seeing something else, then we've
+                        * examined all "self" locks.  Note that bailing out
+                        * here is quite important; for coalescing, we assume
+                        * numerically adjacent locks from the same owner to
+                        * be adjacent on the list.
+                        */
+                       if ((type & SELF) && found_self) {
+                               return OVERLAP_NONE;
+                       }
+
                        *prev = &lf->lf_next;
                        *overlap = lf = lf->lf_next;
                        continue;
                }
-#ifdef LOCKF_DEBUG
-               if (lockf_debug & 2)
+
+               if ((type & SELF)) {
+                       found_self = 1;
+               }
+
+#ifdef LOCKF_DEBUGGING
+               if (LOCKF_DEBUGP(LF_DBG_LIST)) {
                        lf_print("\tchecking", lf);
-#endif /* LOCKF_DEBUG */
+               }
+#endif /* LOCKF_DEBUGGING */
                /*
                 * OK, check for overlap
-                *
-                * Six cases:
-                *      0) no overlap
-                *      1) overlap == lock
-                *      2) overlap contains lock
-                *      3) lock contains overlap
-                *      4) overlap starts before lock
-                *      5) overlap ends after lock
                 */
                if ((lf->lf_end != -1 && start > lf->lf_end) ||
                    (end != -1 && lf->lf_start > end)) {
                        /* Case 0 */
-#ifdef LOCKF_DEBUG
-                       if (lockf_debug & 2)
-                               printf("no overlap\n");
-#endif /* LOCKF_DEBUG */
-                       if ((type & SELF) && end != -1 && lf->lf_start > end)
-                               return (0);
+                       LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");
+
+                       /*
+                        * NOTE: assumes that locks for the same process are
+                        * nonintersecting and ordered.
+                        */
+                       if ((type & SELF) && end != -1 && lf->lf_start > end) {
+                               return OVERLAP_NONE;
+                       }
                        *prev = &lf->lf_next;
                        *overlap = lf = lf->lf_next;
                        continue;
                }
                if ((lf->lf_start == start) && (lf->lf_end == end)) {
-                       /* Case 1 */
-#ifdef LOCKF_DEBUG
-                       if (lockf_debug & 2)
-                               printf("overlap == lock\n");
-#endif /* LOCKF_DEBUG */
-                       return (1);
+                       LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
+                       return OVERLAP_EQUALS_LOCK;
                }
                if ((lf->lf_start <= start) &&
                    (end != -1) &&
                    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
-                       /* Case 2 */
-#ifdef LOCKF_DEBUG
-                       if (lockf_debug & 2)
-                               printf("overlap contains lock\n");
-#endif /* LOCKF_DEBUG */
-                       return (2);
+                       LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
+                       return OVERLAP_CONTAINS_LOCK;
                }
                if (start <= lf->lf_start &&
-                          (end == -1 ||
-                          (lf->lf_end != -1 && end >= lf->lf_end))) {
-                       /* Case 3 */
-#ifdef LOCKF_DEBUG
-                       if (lockf_debug & 2)
-                               printf("lock contains overlap\n");
-#endif /* LOCKF_DEBUG */
-                       return (3);
+                   (end == -1 ||
+                   (lf->lf_end != -1 && end >= lf->lf_end))) {
+                       LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
+                       return OVERLAP_CONTAINED_BY_LOCK;
                }
                if ((lf->lf_start < start) &&
-                       ((lf->lf_end >= start) || (lf->lf_end == -1))) {
-                       /* Case 4 */
-#ifdef LOCKF_DEBUG
-                       if (lockf_debug & 2)
-                               printf("overlap starts before lock\n");
-#endif /* LOCKF_DEBUG */
-                       return (4);
+                   ((lf->lf_end >= start) || (lf->lf_end == -1))) {
+                       LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
+                       return OVERLAP_STARTS_BEFORE_LOCK;
                }
                if ((lf->lf_start > start) &&
-                       (end != -1) &&
-                       ((lf->lf_end > end) || (lf->lf_end == -1))) {
-                       /* Case 5 */
-#ifdef LOCKF_DEBUG
-                       if (lockf_debug & 2)
-                               printf("overlap ends after lock\n");
-#endif /* LOCKF_DEBUG */
-                       return (5);
+                   (end != -1) &&
+                   ((lf->lf_end > end) || (lf->lf_end == -1))) {
+                       LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
+                       return OVERLAP_ENDS_AFTER_LOCK;
                }
                panic("lf_findoverlap: default");
        }
-       return (0);
+       return OVERLAP_NONE;
 }
 
+
 /*
- * Split a lock and a contained region into
- * two or three locks as necessary.
+ * lf_split
+ *
+ * Description:        Split a lock and a contained region into two or three locks
+ *             as necessary.
+ *
+ * Parameters: lock1                   Lock to split
+ *             lock2                   Overlapping lock region requiring the
+ *                                     split (upgrade/downgrade/unlock)
+ *
+ * Returns:    0                       Success
+ *             ENOLCK                  No memory for new lock
+ *
+ * Implicit Returns:
+ *             *lock1                  Modified original lock
+ *             *lock2                  Overlapping lock (inserted into list)
+ *             (new lock)              Potential new lock inserted into list
+ *                                     if split results in 3 locks
+ *
+ * Notes:      This operation can only fail if the split would result in three
+ *             locks, and there is insufficient memory to allocate the third
+ *             lock; in that case, neither of the locks will be modified.
  */
-static void
-lf_split(lock1, lock2)
-       struct lockf *lock1;
-       struct lockf *lock2;
+static int
+lf_split(struct lockf *lock1, struct lockf *lock2)
 {
        struct lockf *splitlock;
 
-#ifdef LOCKF_DEBUG
-       if (lockf_debug & 2) {
+#ifdef LOCKF_DEBUGGING
+       if (LOCKF_DEBUGP(LF_DBG_LIST)) {
                lf_print("lf_split", lock1);
                lf_print("splitting from", lock2);
        }
-#endif /* LOCKF_DEBUG */
+#endif /* LOCKF_DEBUGGING */
        /*
-        * Check to see if spliting into only two pieces.
+        * Check to see if splitting into only two pieces.
+        * That is the case when lock2 begins or ends exactly where lock1
+        * does: lock1 is trimmed in place, lock2 is linked next to it, and
+        * nothing is allocated.
         */
        if (lock1->lf_start == lock2->lf_start) {
                lock1->lf_start = lock2->lf_end + 1;
                lock2->lf_next = lock1;
-               return;
+               return 0;
        }
        if (lock1->lf_end == lock2->lf_end) {
                lock1->lf_end = lock2->lf_start - 1;
                lock2->lf_next = lock1->lf_next;
                lock1->lf_next = lock2;
-               return;
+               return 0;
        }
        /*
         * Make a new lock consisting of the last part of
         * the encompassing lock
+        * (the range following lock2).  It starts out as a copy of lock1
+        * with its own, empty block queue.  ENOLCK is returned if the
+        * allocation fails.
         */
        MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
+       if (splitlock == NULL) {
+               return ENOLCK;
+       }
        bcopy(lock1, splitlock, sizeof *splitlock);
        splitlock->lf_start = lock2->lf_end + 1;
        TAILQ_INIT(&splitlock->lf_blkhd);
@@ -782,104 +1365,334 @@ lf_split(lock1, lock2)
        splitlock->lf_next = lock1->lf_next;
        lock2->lf_next = splitlock;
        lock1->lf_next = lock2;
+
+       return 0;
 }
 
+
 /*
- * Wakeup a blocklist
+ * lf_wakelock
+ *
+ * Wakeup a blocklist in the case of a downgrade or unlock, since others
+ * waiting on the lock may now be able to acquire it.
+ *
+ * Parameters: listhead                Lock list head on which waiters may
+ *                                     have pending locks
+ *             force_all               If TRUE, wake every waiter even when
+ *                                     listhead has F_WAKE1_SAFE set
+ *
+ * Returns:    <void>
+ *
+ * Notes:      By default this function wakes every waiter queued on
+ *             listhead, rather than only waiters for the contended regions.
+ *             Because of this, for heavily contended files, this can result
+ *             in a "thundering herd" situation.  Refactoring the code could
+ *             make this operation more efficient, if heavy contention ever
+ *             results in a real-world performance problem.
+ *
+ *             When listhead has F_WAKE1_SAFE set and force_all is FALSE,
+ *             only the first waiter is woken; the remaining waiters are
+ *             moved onto that waiter's own block list and their lf_next
+ *             pointers are redirected to it.
  */
 static void
-lf_wakelock(listhead)
-       struct lockf *listhead;
+lf_wakelock(struct lockf *listhead, boolean_t force_all)
 {
        struct lockf *wakelock;
+       boolean_t wake_all = TRUE;
+
+       if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE)) {
+               wake_all = FALSE;
+       }
 
        while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
                wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
                TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
+
                wakelock->lf_next = NOLOCKF;
-#ifdef LOCKF_DEBUG
-               if (lockf_debug & 2)
+#ifdef LOCKF_DEBUGGING
+               if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
                        lf_print("lf_wakelock: awakening", wakelock);
-#endif /* LOCKF_DEBUG */
+               }
+#endif /* LOCKF_DEBUGGING */
+               if (wake_all == FALSE) {
+                       /*
+                        * If there are items on the list head block list,
+                        * move them to the wakelock list instead, and then
+                        * correct their lf_next pointers.
+                        */
+                       if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
+                               TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
+
+                               struct lockf *tlock;
+
+                               TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
+                                       if (TAILQ_NEXT(tlock, lf_block) == tlock) {
+                                               /* See rdar://10887303 */
+                                               panic("cycle in wakelock list");
+                                       }
+                                       tlock->lf_next = wakelock;
+                               }
+                       }
+               }
                wakeup(wakelock);
+
+               if (wake_all == FALSE) {
+                       break;
+               }
        }
 }
 
-#ifdef LOCKF_DEBUG
+
+#ifdef LOCKF_DEBUGGING
+#define GET_LF_OWNER_PID(lf)    (proc_pid((lf)->lf_owner))
+
 /*
- * Print out a lock.
+ * lf_print DEBUG
+ *
+ * Print out a lock; lock information is prefixed by the string in 'tag'
+ *
+ * Parameters: tag                     A string tag for debugging
+ *             lock                    The lock whose information should be
+ *                                     displayed
+ *
+ * Returns:    <void>
+ *
+ * Notes:      The owner is reported as "proc" for F_POSIX locks, as "fg"
+ *             for F_OFD_LOCK locks, and as a bare "id" otherwise.  The
+ *             vnode is included only when lf_vnode is non-zero.  If the
+ *             lock has blocked requests, only the first one is printed.
+ *
+ *             NOTE(review): lf_start/lf_end are cast to intmax_t but are
+ *             formatted with %llx; confirm the two agree on all targets.
  */
 void
-lf_print(tag, lock)
-       char *tag;
-       struct lockf *lock;
+lf_print(const char *tag, struct lockf *lock)
 {
-
        printf("%s: lock %p for ", tag, (void *)lock);
-       if (lock->lf_flags & F_POSIX)
-               printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid);
-       else
+       if (lock->lf_flags & F_POSIX) {
+               printf("proc %p (owner %d)",
+                   lock->lf_id, GET_LF_OWNER_PID(lock));
+       } else if (lock->lf_flags & F_OFD_LOCK) {
+               printf("fg %p (owner %d)",
+                   lock->lf_id, GET_LF_OWNER_PID(lock));
+       } else {
                printf("id %p", (void *)lock->lf_id);
-       if (lock->lf_vnode != 0)
-               printf(" in vno 0x%08x, %s, start %jd, end %jd",
+       }
+       if (lock->lf_vnode != 0) {
+               printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
                    lock->lf_vnode,
                    lock->lf_type == F_RDLCK ? "shared" :
                    lock->lf_type == F_WRLCK ? "exclusive" :
                    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
                    (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
-       else
-               printf(" %s, start %jd, end %jd",
+       } else {
+               printf(" %s, start 0x%016llx, end 0x%016llx",
                    lock->lf_type == F_RDLCK ? "shared" :
                    lock->lf_type == F_WRLCK ? "exclusive" :
                    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
                    (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
-       if (!TAILQ_EMPTY(&lock->lf_blkhd))
+       }
+       if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
                printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
-       else
+       } else {
                printf("\n");
+       }
 }
 
+
+/*
+ * lf_printlist DEBUG
+ *
+ * Print out a lock list for the vnode associated with 'lock'; lock information
+ * is prefixed by the string in 'tag'
+ *
+ * Parameters: tag                     A string tag for debugging
+ *             lock                    The lock whose vnode's lock list should
+ *                                     be displayed
+ *
+ * Returns:    <void>
+ *
+ * Notes:      Nothing is printed if 'lock' has no vnode.  Otherwise the
+ *             vnode's v_lockf list is walked through lf_next, and each
+ *             lock is followed by the requests queued on its lf_blkhd.
+ *             A queued request that itself has queued requests is treated
+ *             as list corruption and causes a panic.
+ */
 void
-lf_printlist(tag, lock)
-       char *tag;
-       struct lockf *lock;
+lf_printlist(const char *tag, struct lockf *lock)
 {
        struct lockf *lf, *blk;
 
-       if (lock->lf_vnode == 0)
+       if (lock->lf_vnode == 0) {
                return;
+       }
 
-       printf("%s: Lock list for vno 0x%08x:\n",
+       printf("%s: Lock list for vno %p:\n",
            tag, lock->lf_vnode);
        for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
-               printf("\tlock %p for ",(void *)lf);
-               if (lf->lf_flags & F_POSIX)
-                       printf("proc %ld",
-                           (long)((struct proc *)lf->lf_id)->p_pid);
-               else
+               printf("\tlock %p for ", (void *)lf);
+               if (lf->lf_flags & F_POSIX) {
+                       printf("proc %p (owner %d)",
+                           lf->lf_id, GET_LF_OWNER_PID(lf));
+               } else if (lf->lf_flags & F_OFD_LOCK) {
+                       printf("fg %p (owner %d)",
+                           lf->lf_id, GET_LF_OWNER_PID(lf));
+               } else {
                        printf("id %p", (void *)lf->lf_id);
-               printf(", %s, start %jd, end %jd",
+               }
+               printf(", %s, start 0x%016llx, end 0x%016llx",
                    lf->lf_type == F_RDLCK ? "shared" :
                    lf->lf_type == F_WRLCK ? "exclusive" :
                    lf->lf_type == F_UNLCK ? "unlock" :
                    "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
                TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
                        printf("\n\t\tlock request %p for ", (void *)blk);
-                       if (blk->lf_flags & F_POSIX)
-                               printf("proc %ld",
-                                   (long)((struct proc *)blk->lf_id)->p_pid);
-                       else
+                       if (blk->lf_flags & F_POSIX) {
+                               printf("proc %p (owner %d)",
+                                   blk->lf_id, GET_LF_OWNER_PID(blk));
+                       } else if (blk->lf_flags & F_OFD_LOCK) {
+                               printf("fg %p (owner %d)",
+                                   blk->lf_id, GET_LF_OWNER_PID(blk));
+                       } else {
                                printf("id %p", (void *)blk->lf_id);
-                       printf(", %s, start %jd, end %jd",
+                       }
+                       printf(", %s, start 0x%016llx, end 0x%016llx",
                            blk->lf_type == F_RDLCK ? "shared" :
                            blk->lf_type == F_WRLCK ? "exclusive" :
                            blk->lf_type == F_UNLCK ? "unlock" :
                            "unknown", (intmax_t)blk->lf_start,
                            (intmax_t)blk->lf_end);
-                       if (!TAILQ_EMPTY(&blk->lf_blkhd))
+                       if (!TAILQ_EMPTY(&blk->lf_blkhd)) {
                                panic("lf_printlist: bad list");
+                       }
                }
                printf("\n");
        }
 }
-#endif /* LOCKF_DEBUG */
+#endif /* LOCKF_DEBUGGING */
+
+#if IMPORTANCE_INHERITANCE
+
+/*
+ * lf_hold_assertion
+ *
+ * Call task importance hold assertion on the owner of the lock.
+ *
+ * Parameters: block_task               Owner of the lock blocking
+ *                                      current thread.
+ *
+ *             block                    lock on which the current thread
+ *                                      is blocking.
+ *
+ * Returns:    <void>
+ *
+ * Notes: block->lf_boosted is set to LF_BOOSTED only if
+ *        task_importance_hold_file_lock_assertion() returns 0; otherwise
+ *        the lock is left unchanged.
+ *
+ *        A task reference on block_task does not need to be held, since
+ *        the current thread has the vnode lock and block_task has a file
+ *        lock; removing the file lock in exit therefore requires
+ *        block_task to grab the vnode lock.
+ */
+static void
+lf_hold_assertion(task_t block_task, struct lockf *block)
+{
+       if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
+               block->lf_boosted = LF_BOOSTED;
+               LOCKF_DEBUG(LF_DBG_IMPINH,
+                   "lf: importance hold file lock assert on pid %d lock %p\n",
+                   proc_pid(block->lf_owner), block);
+       }
+}
+
+
+/*
+ * lf_jump_to_queue_head
+ *
+ * Move a lock from its current position in the block queue to the
+ * head of the queue.
+ *
+ * Parameters: block                    lockf struct containing the
+ *                                      block queue.
+ *             lock                     lockf struct to be jumped to the
+ *                                      front.
+ *
+ * Returns:    <void>
+ *
+ * Notes: 'lock' is removed from block's lf_blkhd before being reinserted,
+ *        so it is presumably required to already be queued there; confirm
+ *        at the call sites.
+ */
+static void
+lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
+{
+       /* Move the lock to the head of the block queue. */
+       TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
+       TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
+}
+
+
+/*
+ * lf_drop_assertion
+ *
+ * Drops the task hold assertion.
+ *
+ * Parameters: block                    lockf struct holding the assertion.
+ *
+ * Returns:    <void>
+ *
+ * Notes: The assertion is dropped on the task of block's owner; despite
+ *        its name, the local 'current_task' is derived from
+ *        block->lf_owner rather than from the calling thread.
+ *        lf_boosted is reset to LF_NOT_BOOSTED unconditionally; this
+ *        function does not check that the lock was boosted.
+ */
+static void
+lf_drop_assertion(struct lockf *block)
+{
+       LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
+           proc_pid(block->lf_owner), block);
+
+       task_t current_task = proc_task(block->lf_owner);
+       task_importance_drop_file_lock_assertion(current_task, 1);
+       block->lf_boosted = LF_NOT_BOOSTED;
+}
+
+/*
+ * lf_adjust_assertion
+ *
+ * Adjusts importance assertion of file lock. Goes through
+ * all the blocking locks and checks if the file lock still
+ * needs to be boosted.
+ *
+ * Parameters: block   lockf structure which needs to be adjusted.
+ *
+ * Returns:    <void>
+ *
+ * Notes: Does nothing if block is not currently boosted.  Otherwise the
+ *        boost is kept if some request queued on block's lf_blkhd
+ *        satisfies both of the following, and is dropped via
+ *        lf_drop_assertion() if none does:
+ *
+ *        - it is the same kind of lock as block: both F_POSIX, or both
+ *          F_OFD_LOCK with different, non-NULL owners;
+ *        - its owner's task is an importance donor and block's owner's
+ *          task is of an importance receiver type.
+ */
+static void
+lf_adjust_assertion(struct lockf *block)
+{
+       boolean_t drop_boost = TRUE;
+       struct lockf *next;
+
+       /* Return if the lock is not boosted */
+       if (block->lf_boosted == LF_NOT_BOOSTED) {
+               return;
+       }
+
+       TAILQ_FOREACH(next, &block->lf_blkhd, lf_block) {
+               /* Check if block and next are same type of locks */
+               if (((block->lf_flags & next->lf_flags & F_POSIX) != 0) ||
+                   ((block->lf_flags & next->lf_flags & F_OFD_LOCK) &&
+                   (block->lf_owner != next->lf_owner) &&
+                   (NULL != block->lf_owner && NULL != next->lf_owner))) {
+                       /* Check if next would be boosting block */
+                       if (task_is_importance_donor(proc_task(next->lf_owner)) &&
+                           task_is_importance_receiver_type(proc_task(block->lf_owner))) {
+                               /* Found a lock boosting block */
+                               drop_boost = FALSE;
+                               break;
+                       }
+               }
+       }
+
+       if (drop_boost) {
+               lf_drop_assertion(block);
+       }
+}
+
+static void
+lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
+{
+       task_t ltask = proc_task(lock->lf_owner);
+       task_t btask = proc_task(block->lf_owner);
+
+       /*
+        * 'lock' is the request that may donate importance (its owner's
+        * task is ltask); 'block' is the lock whose owner (btask) may be
+        * boosted.  'lock' appears to be a request queued on block's
+        * lf_blkhd, since it is moved to the head of that queue below.
+        *
+        * If ltask is a donor, 'lock' is always moved to the head of the
+        * queue; the hold assertion is taken only when block is not
+        * already boosted and btask is of an importance receiver type.
+        * If ltask is not a donor, nothing is done.
+        *
+        * Check if ltask can donate importance. The
+        * check of imp_donor bit is done without holding
+        * any lock. The value may change after you read it,
+        * but it is ok to boost a task while someone else is
+        * unboosting you.
+        *
+        * TODO: Support live inheritance on file locks.
+        */
+       if (task_is_importance_donor(ltask)) {
+               LOCKF_DEBUG(LF_DBG_IMPINH,
+                   "lf: %d: attempt to boost pid %d that holds lock %p\n",
+                   proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);
+
+               if (block->lf_boosted != LF_BOOSTED &&
+                   task_is_importance_receiver_type(btask)) {
+                       lf_hold_assertion(btask, block);
+               }
+               lf_jump_to_queue_head(block, lock);
+       }
+}
+#endif /* IMPORTANCE_INHERITANCE */