#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
+#include <sys/sdt.h>
+#include <kern/task.h>
/*
* This variable controls the maximum number of processes that will
void lf_print(const char *tag, struct lockf *lock);
void lf_printlist(const char *tag, struct lockf *lock);
static int lockf_debug = 2;
-SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
+SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");
/*
* If there is no mask bit selector, or there is on, and the selector is
static int lf_clearlock(struct lockf *);
static overlap_t lf_findoverlap(struct lockf *,
struct lockf *, int, struct lockf ***, struct lockf **);
-static struct lockf *lf_getblock(struct lockf *);
-static int lf_getlock(struct lockf *, struct flock *);
-static int lf_setlock(struct lockf *);
+static struct lockf *lf_getblock(struct lockf *, pid_t);
+static int lf_getlock(struct lockf *, struct flock *, pid_t);
+static int lf_setlock(struct lockf *, struct timespec *);
static int lf_split(struct lockf *, struct lockf *);
static void lf_wakelock(struct lockf *, boolean_t);
-
-
-/*
- * in order to mitigate risk
- * don't switch to new wake-one method unless
- * we have at least this many waiters to wake up
- */
-#define SAFE_WAITER_LIMIT 20
-
+#if IMPORTANCE_INHERITANCE
+static void lf_hold_assertion(task_t, struct lockf *);
+static void lf_jump_to_queue_head(struct lockf *, struct lockf *);
+static void lf_drop_assertion(struct lockf *);
+#endif /* IMPORTANCE_INHERITANCE */
/*
* lf_advlock
* lf_setlock:EDEADLK
* lf_setlock:EINTR
* lf_setlock:ENOLCK
+ * lf_setlock:ETIMEDOUT
* lf_clearlock:ENOLCK
* vnode_size:???
*
lock->lf_type = fl->l_type;
lock->lf_head = head;
lock->lf_next = (struct lockf *)0;
- lock->lf_waiters = 0;
TAILQ_INIT(&lock->lf_blkhd);
lock->lf_flags = ap->a_flags;
+#if IMPORTANCE_INHERITANCE
+ lock->lf_boosted = LF_NOT_BOOSTED;
+#endif /* IMPORTANCE_INHERITANCE */
if (ap->a_flags & F_FLOCK)
lock->lf_flags |= F_WAKE1_SAFE;
*/
switch(ap->a_op) {
case F_SETLK:
- error = lf_setlock(lock);
+ error = lf_setlock(lock, ap->a_timeout);
break;
case F_UNLCK:
break;
case F_GETLK:
- error = lf_getlock(lock, fl);
+ error = lf_getlock(lock, fl, -1);
FREE(lock, M_LOCKF);
break;
+
default:
FREE(lock, M_LOCKF);
error = EINVAL;
break;
}
- lck_mtx_unlock(&vp->v_lock); /* done maniplulating the list */
+ lck_mtx_unlock(&vp->v_lock); /* done manipulating the list */
LOCKF_DEBUG(0, "lf_advlock: normal exit: %d\n\n", error);
return (error);
}
+/*
+ * Empty the queue of msleeping requests for a lock on the given vnode.
+ * Called with the vnode already locked. Used for forced unmount, where
+ * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
+ * that prevents the vnode from ever being drained. Force unmounting wins.
+ */
+void
+lf_abort_advlocks(vnode_t vp)
+{
+ struct lockf *lock;
+
+ if ((lock = vp->v_lockf) == NULL)
+ return;
+
+ lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
+
+ if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
+ struct lockf *tlock;
+
+ TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
+ /*
+ * Setting this flag should cause all
+ * currently blocked F_SETLK request to
+ * return to userland with an errno.
+ */
+ tlock->lf_flags |= F_ABORT;
+ }
+ lf_wakelock(lock, TRUE);
+ }
+}
+
+/*
+ * Take any lock attempts which are currently blocked by a given lock ("from")
+ * and mark them as blocked by a different lock ("to"). Used in the case
+ * where a byte range currently occupied by "from" is to be occupied by "to."
+ */
+static void
+lf_move_blocked(struct lockf *to, struct lockf *from)
+{
+ struct lockf *tlock;
+
+ TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
+ tlock->lf_next = to;
+ }
+
+ TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
+}
/*
- * lf_coelesce_adjacent
+ * lf_coalesce_adjacent
*
- * Description: Helper function: when setting a lock, coelesce adjacent
+ * Description: Helper function: when setting a lock, coalesce adjacent
* locks. Needed because adjacent locks are not overlapping,
- * but POSIX requires that they be coelesced.
+ * but POSIX requires that they be coalesced.
*
* Parameters: lock The new lock which may be adjacent
- * to already locked reagions, and which
- * should therefore be coelesced with them
+ * to already locked regions, and which
+ * should therefore be coalesced with them
*
* Returns: <void>
*/
static void
-lf_coelesce_adjacent(struct lockf *lock)
+lf_coalesce_adjacent(struct lockf *lock)
{
struct lockf **lf = lock->lf_head;
while (*lf != NOLOCKF) {
- /* reject locks that obviously could not be coelesced */
+ /* reject locks that obviously could not be coalesced */
if ((*lf == lock) ||
((*lf)->lf_id != lock->lf_id) ||
((*lf)->lf_type != lock->lf_type)) {
continue;
}
- /* If the lock ends adjacent to us, we can coelesce it */
+ /*
+ * NOTE: Assumes that if two locks are adjacent on the number line
+ * and belong to the same owner, then they are adjacent on the list.
+ */
if ((*lf)->lf_end != -1 &&
((*lf)->lf_end + 1) == lock->lf_start) {
struct lockf *adjacent = *lf;
- LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent previous\n");
+ LOCKF_DEBUG(0, "lf_coalesce_adjacent: coalesce adjacent previous\n");
lock->lf_start = (*lf)->lf_start;
*lf = lock;
lf = &(*lf)->lf_next;
+
+ lf_move_blocked(lock, adjacent);
+
FREE(adjacent, M_LOCKF);
continue;
}
- /* If the lock starts adjacent to us, we can coelesce it */
+ /* If the lock starts adjacent to us, we can coalesce it */
if (lock->lf_end != -1 &&
(lock->lf_end + 1) == (*lf)->lf_start) {
struct lockf *adjacent = *lf;
- LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent following\n");
+ LOCKF_DEBUG(0, "lf_coalesce_adjacent: coalesce adjacent following\n");
lock->lf_end = (*lf)->lf_end;
lock->lf_next = (*lf)->lf_next;
lf = &lock->lf_next;
+
+ lf_move_blocked(lock, adjacent);
+
FREE(adjacent, M_LOCKF);
continue;
}
* the set is successful, and freed if the
* set is unsuccessful.
*
+ * timeout Timeout specified in the case of
+ * SETLKWTIMEOUT.
+ *
* Returns: 0 Success
* EAGAIN
* EDEADLK
* lf_split:ENOLCK
* lf_clearlock:ENOLCK
* msleep:EINTR
+ * msleep:ETIMEDOUT
*
* Notes: We add the lock to the provisional lock list. We do not
- * coelesce at this time; this has implications for other lock
+ * coalesce at this time; this has implications for other lock
* requestors in the blocker search mechanism.
*/
static int
-lf_setlock(struct lockf *lock)
+lf_setlock(struct lockf *lock, struct timespec *timeout)
{
struct lockf *block;
struct lockf **head = lock->lf_head;
int priority, needtolink, error;
struct vnode *vp = lock->lf_vnode;
overlap_t ovcase;
+#if IMPORTANCE_INHERITANCE
+ task_t boosting_task, block_task;
+#endif /* IMPORTANCE_INHERITANCE */
#ifdef LOCKF_DEBUGGING
if (lockf_debug & 1) {
/*
* Scan lock list for this file looking for locks that would block us.
*/
- while ((block = lf_getblock(lock))) {
+ while ((block = lf_getblock(lock, -1))) {
/*
* Free the structure and return if nonblocking.
*/
if ((lock->lf_flags & F_WAIT) == 0) {
+ DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
FREE(lock, M_LOCKF);
return (EAGAIN);
}
*/
lock->lf_next = block;
TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
- block->lf_waiters++;
if ( !(lock->lf_flags & F_FLOCK))
block->lf_flags &= ~F_WAKE1_SAFE;
lf_printlist("lf_setlock(block)", block);
}
#endif /* LOCKF_DEBUGGING */
- error = msleep(lock, &vp->v_lock, priority, lockstr, 0);
-
- if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
- struct lockf *tlock;
-
- if ((block = lf_getblock(lock))) {
- TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
- tlock->lf_next = block;
+ DTRACE_FSINFO(advlock__wait, vnode_t, vp);
+#if IMPORTANCE_INHERITANCE
+ /*
+ * Posix type of locks are not inherited by child processes and
+ * it maintains one to one mapping between lock and its owner, while
+ * Flock type of locks are inherited across forks and it does not
+ * maintian any one to one mapping between the lock and the lock
+ * owner. Thus importance donation is done only for Posix type of
+ * locks.
+ */
+ if ((lock->lf_flags & F_POSIX) && (block->lf_flags & F_POSIX)) {
+ block_task = proc_task((proc_t) block->lf_id);
+ boosting_task = proc_task((proc_t) lock->lf_id);
+
+ /* Check if current task can donate importance. The
+ * check of imp_donor bit is done without holding
+ * any lock. The value may change after you read it,
+ * but it is ok to boost a task while someone else is
+ * unboosting you.
+ *
+ * TODO: Support live inheritance on file locks.
+ */
+ if (task_is_importance_donor(boosting_task)) {
+ if (block->lf_boosted != LF_BOOSTED &&
+ task_is_importance_receiver_type(block_task)) {
+ lf_hold_assertion(block_task, block);
}
- TAILQ_CONCAT(&block->lf_blkhd, &lock->lf_blkhd, lf_block);
-
- block->lf_waiters += lock->lf_waiters;
- lock->lf_waiters = 0;
+ lf_jump_to_queue_head(block, lock);
}
}
- if (error) { /* XXX */
+#endif /* IMPORTANCE_INHERITANCE */
+ error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
+
+ if (error == 0 && (lock->lf_flags & F_ABORT) != 0)
+ error = EBADF;
+
+ if (lock->lf_next) {
/*
- * We may have been awakened by a signal and/or by a
- * debugger continuing us (in which cases we must remove
- * ourselves from the blocked list) and/or by another
- * process releasing a lock (in which case we have
- * already been removed from the blocked list and our
- * lf_next field set to NOLOCKF).
+ * lf_wakelock() always sets wakelock->lf_next to
+ * NULL before a wakeup; so we've been woken early
+ * - perhaps by a debugger, signal or other event.
+ *
+ * Remove 'lock' from the block list (avoids double-add
+ * in the spurious case, which would create a cycle)
*/
- if (lock->lf_next) {
- TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
- lock->lf_next->lf_waiters--;
- lock->lf_next = NOLOCKF;
+ TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
+ lock->lf_next = NULL;
+
+ if (error == 0) {
+ /*
+ * If this was a spurious wakeup, retry
+ */
+ printf("%s: spurious wakeup, retrying lock\n",
+ __func__);
+ continue;
}
+ }
+
+ if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
+ if ((block = lf_getblock(lock, -1)) != NULL)
+ lf_move_blocked(block, lock);
+ }
+
+ if (error) {
if (!TAILQ_EMPTY(&lock->lf_blkhd))
lf_wakelock(lock, TRUE);
-
FREE(lock, M_LOCKF);
+ /* Return ETIMEDOUT if timeout occoured. */
+ if (error == EWOULDBLOCK) {
+ error = ETIMEDOUT;
+ }
return (error);
- } /* XXX */
+ }
}
+
/*
* No blocks!! Add the lock. Note that we will
* downgrade or upgrade any overlapping locks this
lf_wakelock(overlap, TRUE);
overlap->lf_type = lock->lf_type;
FREE(lock, M_LOCKF);
- lock = overlap; /* for lf_coelesce_adjacent() */
+ lock = overlap; /* for lf_coalesce_adjacent() */
break;
case OVERLAP_CONTAINS_LOCK:
*/
if (overlap->lf_type == lock->lf_type) {
FREE(lock, M_LOCKF);
- lock = overlap; /* for lf_coelesce_adjacent() */
+ lock = overlap; /* for lf_coalesce_adjacent() */
break;
}
if (overlap->lf_start == lock->lf_start) {
ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
lf_block);
- overlap->lf_waiters--;
-
TAILQ_INSERT_TAIL(&lock->lf_blkhd,
ltmp, lf_block);
- lock->lf_waiters++;
-
ltmp->lf_next = lock;
}
}
}
break;
}
- /* Coelesce adjacent locks with identical attributes */
- lf_coelesce_adjacent(lock);
+ /* Coalesce adjacent locks with identical attributes */
+ lf_coalesce_adjacent(lock);
#ifdef LOCKF_DEBUGGING
if (lockf_debug & 1) {
lf_print("lf_setlock: got the lock", lock);
* Wakeup the list of locks to be retried.
*/
lf_wakelock(overlap, FALSE);
+#if IMPORTANCE_INHERITANCE
+ if (overlap->lf_boosted == LF_BOOSTED) {
+ lf_drop_assertion(overlap);
+ }
+#endif /* IMPORTANCE_INHERITANCE */
switch (ovcase) {
case OVERLAP_NONE: /* satisfy compiler enum/switch */
* fl Pointer to flock structure to receive
* the blocking lock information, if a
* blocking lock is found.
+ * matchpid -1, or pid value to match in lookup.
*
* Returns: 0 Success
*
* the blocking process ID for advisory record locks.
*/
static int
-lf_getlock(struct lockf *lock, struct flock *fl)
+lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
{
struct lockf *block;
lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUGGING */
- if ((block = lf_getblock(lock))) {
+ if ((block = lf_getblock(lock, matchpid))) {
fl->l_type = block->lf_type;
fl->l_whence = SEEK_SET;
fl->l_start = block->lf_start;
return (0);
}
-
/*
* lf_getblock
*
*
* Parameters: lock The lock for which we are interested
* in obtaining the blocking lock, if any
+ * matchpid -1, or pid value to match in lookup.
*
* Returns: NOLOCKF No blocking lock exists
* !NOLOCKF The address of the blocking lock's
* struct lockf.
*/
static struct lockf *
-lf_getblock(struct lockf *lock)
+lf_getblock(struct lockf *lock, pid_t matchpid)
{
struct lockf **prev, *overlap, *lf = *(lock->lf_head);
- int ovcase;
- prev = lock->lf_head;
- while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) != OVERLAP_NONE) {
+ for (prev = lock->lf_head;
+ lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
+ lf = overlap->lf_next) {
/*
- * We've found an overlap, see if it blocks us
+ * Found an overlap.
+ *
+ * If we're matching pids, and it's a record lock,
+ * but the pid doesn't match, then keep on looking ..
*/
- if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
- return (overlap);
+ if (matchpid != -1 &&
+ (overlap->lf_flags & F_POSIX) != 0 &&
+ proc_pid((struct proc *)(overlap->lf_id)) != matchpid)
+ continue;
/*
- * Nope, point to the next one on the list and
- * see if it blocks us
+ * does it block us?
*/
- lf = overlap->lf_next;
+ if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
+ return (overlap);
}
return (NOLOCKF);
}
* this is generally used to relink the
* lock list, avoiding a second iteration.
* *overlap The pointer to the overlapping lock
- * itself; this is ussed to return data in
+ * itself; this is used to return data in
* the check == OTHERS case, and for the
* caller to modify the overlapping lock,
* in the check == SELF case
* while lf_setlock will iterate over all overlapping locks to
*
* The check parameter can be SELF, meaning we are looking for
- * overelapping locks owned by us, or it can be OTHERS, meaning
+ * overlapping locks owned by us, or it can be OTHERS, meaning
* we are looking for overlapping locks owned by someone else so
* we can report a blocking lock on an F_GETLK request.
*
struct lockf ***prev, struct lockf **overlap)
{
off_t start, end;
+ int found_self = 0;
*overlap = lf;
if (lf == NOLOCKF)
while (lf != NOLOCKF) {
if (((type & SELF) && lf->lf_id != lock->lf_id) ||
((type & OTHERS) && lf->lf_id == lock->lf_id)) {
+ /*
+ * Locks belonging to one process are adjacent on the
+ * list, so if we've found any locks belonging to us,
+ * and we're now seeing something else, then we've
+ * examined all "self" locks. Note that bailing out
+ * here is quite important; for coalescing, we assume
+ * numerically adjacent locks from the same owner to
+ * be adjacent on the list.
+ */
+ if ((type & SELF) && found_self) {
+ return OVERLAP_NONE;
+ }
+
*prev = &lf->lf_next;
*overlap = lf = lf->lf_next;
continue;
}
+
+ if ((type & SELF)) {
+ found_self = 1;
+ }
+
#ifdef LOCKF_DEBUGGING
if (lockf_debug & 2)
lf_print("\tchecking", lf);
(end != -1 && lf->lf_start > end)) {
/* Case 0 */
LOCKF_DEBUG(2, "no overlap\n");
+
+ /*
+ * NOTE: assumes that locks for the same process are
+ * nonintersecting and ordered.
+ */
if ((type & SELF) && end != -1 && lf->lf_start > end)
return (OVERLAP_NONE);
*prev = &lf->lf_next;
struct lockf *wakelock;
boolean_t wake_all = TRUE;
- if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE) && listhead->lf_waiters > SAFE_WAITER_LIMIT)
+ if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE))
wake_all = FALSE;
while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
- listhead->lf_waiters--;
wakelock->lf_next = NOLOCKF;
#ifdef LOCKF_DEBUGGING
lf_print("lf_wakelock: awakening", wakelock);
#endif /* LOCKF_DEBUGGING */
if (wake_all == FALSE) {
+ /*
+ * If there are items on the list head block list,
+ * move them to the wakelock list instead, and then
+ * correct their lf_next pointers.
+ */
+ if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
+ TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
- TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
- wakelock->lf_waiters = listhead->lf_waiters;
- listhead->lf_waiters = 0;
-
- if (!TAILQ_EMPTY(&wakelock->lf_blkhd)) {
struct lockf *tlock;
TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
+ if (TAILQ_NEXT(tlock, lf_block) == tlock) {
+ /* See rdar://10887303 */
+ panic("cycle in wakelock list");
+ }
tlock->lf_next = wakelock;
}
}
}
}
#endif /* LOCKF_DEBUGGING */
+
+#if IMPORTANCE_INHERITANCE
+
+/*
+ * lf_hold_assertion
+ *
+ * Call task importance hold assertion on the owner of the lock.
+ *
+ * Parameters: block_task Owner of the lock blocking
+ * current thread.
+ *
+ * block lock on which the current thread
+ * is blocking on.
+ *
+ * Returns: <void>
+ *
+ * Notes: The task reference on block_task is not needed to be hold since
+ * the current thread has vnode lock and block_task has a file
+ * lock, thus removing file lock in exit requires block_task to
+ * grab the vnode lock.
+ */
+static void
+lf_hold_assertion(task_t block_task, struct lockf *block)
+{
+ if (task_importance_hold_file_lock_assertion(block_task, 1)) {
+ block->lf_boosted = LF_BOOSTED;
+ }
+}
+
+
+/*
+ * lf_jump_to_queue_head
+ *
+ * Jump the lock from the tail of the block queue to the head of
+ * the queue.
+ *
+ * Parameters: block lockf struct containing the
+ * block queue.
+ * lock lockf struct to be jumped to the
+ * front.
+ *
+ * Returns: <void>
+ */
+static void
+lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
+{
+ /* Move the lock to the head of the block queue. */
+ TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
+ TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
+}
+
+
+/*
+ * lf_drop_assertion
+ *
+ * Drops the task hold assertion.
+ *
+ * Parameters: block lockf struct holding the assertion.
+ *
+ * Returns: <void>
+ */
+static void
+lf_drop_assertion(struct lockf *block)
+{
+ task_t current_task;
+
+ current_task = proc_task((proc_t) block->lf_id);
+ task_importance_drop_file_lock_assertion(current_task, 1);
+ block->lf_boosted = LF_NOT_BOOSTED;
+}
+
+#endif /* IMPORTANCE_INHERITANCE */