apple/xnu.git blobdiff - bsd/nfs/nfs_lock.c (tag: xnu-792.6.70)
index 4edfce39a9346a5802a125aed179e3b5b4d6c51f..762c140b0ca22dc32a946c411c4a2c50f869f69a 100644
--- a/bsd/nfs/nfs_lock.c
+++ b/bsd/nfs/nfs_lock.c
@@ -1,24 +1,21 @@
 /*
- * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
- * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
  * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
  * 
  * @APPLE_LICENSE_HEADER_END@
  */
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>                /* for hz */
-#include <sys/file.h>
-#include <sys/lock.h>
+#include <sys/file_internal.h>
 #include <sys/malloc.h>
 #include <sys/lockf.h>         /* for hz */ /* Must come after sys/malloc.h */
-#include <sys/mbuf.h>
-#include <sys/mount.h>
-#include <sys/namei.h>
-#include <sys/proc.h>
+#include <sys/kpi_mbuf.h>
+#include <sys/mount_internal.h>
+#include <sys/proc_internal.h> /* for p_start */
+#include <sys/kauth.h>
 #include <sys/resourcevar.h>
 #include <sys/socket.h>
-#include <sys/socket.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
-#include <sys/vnode.h>
+#include <sys/vnode_internal.h>
 
-#include <kern/thread_act.h>
+#include <kern/thread.h>
 
 #include <machine/limits.h>
 
 #include <nfs/nfsmount.h>
 #include <nfs/nfsnode.h>
 #include <nfs/nfs_lock.h>
-#include <nfs/nlminfo.h>
 
 #define OFF_MAX QUAD_MAX
 
-uint64_t nfsadvlocks = 0;
-struct timeval nfsadvlock_longest = {0, 0};
-struct timeval nfsadvlocks_time = {0, 0};
-
-pid_t nfslockdpid = 0;
-struct file *nfslockdfp = 0;
+/*
+ * globals for managing the lockd fifo
+ */
+vnode_t nfslockdvnode = 0;
 int nfslockdwaiting = 0;
-int nfslockdfifowritten = 0;
+time_t nfslockdstarttimeout = 0;
 int nfslockdfifolock = 0;
 #define NFSLOCKDFIFOLOCK_LOCKED        1
 #define NFSLOCKDFIFOLOCK_WANT  2
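
[Editor's note: the two flag bits above form a tiny sleep lock over the lockd
fifo: LOCKED marks the fifo busy, WANT records that someone is sleeping on it.
A rough userspace analogue of the same protocol, rebuilt on pthreads where the
kernel code uses tsleep()/wakeup() (a sketch only, not the kernel mechanism):]

#include <pthread.h>

static pthread_mutex_t fifo_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  fifo_cv  = PTHREAD_COND_INITIALIZER;
static int fifolock = 0;                /* bit 0 = LOCKED, bit 1 = WANT */

static void
fifo_lock(void)
{
        pthread_mutex_lock(&fifo_mtx);
        while (fifolock & 1) {          /* NFSLOCKDFIFOLOCK_LOCKED */
                fifolock |= 2;          /* NFSLOCKDFIFOLOCK_WANT */
                pthread_cond_wait(&fifo_cv, &fifo_mtx);
        }
        fifolock |= 1;
        pthread_mutex_unlock(&fifo_mtx);
}

static void
fifo_unlock(void)
{
        pthread_mutex_lock(&fifo_mtx);
        fifolock &= ~1;
        if (fifolock & 2) {             /* wake any sleepers */
                fifolock &= ~2;
                pthread_cond_broadcast(&fifo_cv);
        }
        pthread_mutex_unlock(&fifo_mtx);
}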
 
 /*
- * XXX
- * We have to let the process know if the call succeeded.  I'm using an extra
- * field in the uu_nlminfo field in the uthread structure, as it is already for
- * lockd stuff.
+ * Pending lock request messages are kept in this queue, which is
+ * kept sorted by transaction ID (xid).
+ */
+uint64_t nfs_lockxid = 0;
+LOCKD_MSG_QUEUE nfs_pendlockq;
+
+/*
+ * This structure is used to identify processes which have acquired NFS locks.
+ * Knowing which processes have ever acquired locks allows us to short-circuit
+ * unlock requests for processes that have never had an NFS file lock, thus
+ * avoiding a costly and unnecessary lockd request.
+ */
+struct nfs_lock_pid {
+       TAILQ_ENTRY(nfs_lock_pid)       lp_lru;         /* LRU list */
+       LIST_ENTRY(nfs_lock_pid)        lp_hash;        /* hash chain */
+       int                             lp_valid;       /* valid entry? */
+       int                             lp_time;        /* last time seen valid */
+       pid_t                           lp_pid;         /* The process ID. */
+       struct timeval                  lp_pid_start;   /* Start time of process id */
+};
+
+#define NFS_LOCK_PID_HASH_SIZE         64      // XXX tune me
+#define        NFS_LOCK_PID_HASH(pid)  \
+       (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
+LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
+TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
+u_long nfs_lock_pid_hash;
+int nfs_lock_pid_lock;
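
[Editor's note: hashinit() allocates a power-of-two array of list heads and
returns the size minus one through its mask argument, so NFS_LOCK_PID_HASH
above is just a mask of the pid. A toy demonstration of the bucket selection;
the 64-entry mask is an assumption matching NFS_LOCK_PID_HASH_SIZE:]

#include <stdio.h>
#include <sys/types.h>

int main(void)
{
        unsigned long mask = 64 - 1;    /* what hashinit(64, ...) would hand back */
        pid_t pids[] = { 1, 63, 64, 4242 };
        for (int i = 0; i < 4; i++)
                printf("pid %d -> bucket %lu\n", (int)pids[i],
                    (unsigned long)pids[i] & mask);
        return 0;
}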
+
+
+/*
+ * initialize global nfs lock state
+ */
+void
+nfs_lockinit(void)
+{
+       TAILQ_INIT(&nfs_pendlockq);
+       nfs_lock_pid_lock = 0;
+       nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
+                                        M_TEMP, &nfs_lock_pid_hash);
+       TAILQ_INIT(&nfs_lock_pid_lru);
+}
+
+/*
+ * insert a lock request message into the pending queue
+ */
+static inline void
+nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
+{
+       LOCKD_MSG_REQUEST *mr;
+
+       mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
+       if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
+               /* fast path: empty queue or new largest xid */
+               TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
+               return;
+       }
+       /* slow path: need to walk list to find insertion point */
+       while (mr && (msgreq->lmr_msg.lm_xid < mr->lmr_msg.lm_xid)) {
+               mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
+       }
+       if (mr) {
+               TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
+       } else {
+               TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
+       }
+}
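
[Editor's note: the insert keeps the queue ordered by xid, with a fast path
for the common append case. A self-contained userspace sketch of the same
logic, using <sys/queue.h> with an invented element type:]

#include <sys/queue.h>
#include <stdint.h>
#include <stdlib.h>

struct req {
        uint64_t xid;
        TAILQ_ENTRY(req) next;
};
TAILQ_HEAD(reqq, req);

static void
enqueue_sorted(struct reqq *q, struct req *r)
{
        struct req *mr = TAILQ_LAST(q, reqq);
        if (!mr || r->xid > mr->xid) {          /* fast path: append */
                TAILQ_INSERT_TAIL(q, r, next);
                return;
        }
        while (mr && r->xid < mr->xid)          /* walk back past larger xids */
                mr = TAILQ_PREV(mr, reqq, next);
        if (mr)
                TAILQ_INSERT_AFTER(q, mr, r, next);
        else
                TAILQ_INSERT_HEAD(q, r, next);
}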
+
+/*
+ * remove a lock request message from the pending queue
+ */
+static inline void
+nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
+{
+       TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
+}
+
+/*
+ * find a pending lock request message by xid
+ *
+ * We search from the head of the list assuming that the message we're
+ * looking for is for an older request (because we have an answer to it).
+ * This assumes that lock requests will be answered primarily in FIFO order.
+ * However, this may not be the case if there are blocked requests.  We may
+ * want to move blocked requests to a separate queue (but that'll complicate
+ * duplicate xid checking).
+ */
+static inline LOCKD_MSG_REQUEST *
+nfs_lockdmsg_find_by_xid(uint64_t lockxid)
+{
+       LOCKD_MSG_REQUEST *mr;
+
+       TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
+               if (mr->lmr_msg.lm_xid == lockxid)
+                       return mr;
+               if (mr->lmr_msg.lm_xid > lockxid)
+                       return NULL;
+       }
+       return mr;
+}
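
[Editor's note: because the queue is sorted, lookups can stop early once they
walk past the target xid. Continuing the userspace sketch above:]

static struct req *
find_by_xid(struct reqq *q, uint64_t xid)
{
        struct req *mr;

        TAILQ_FOREACH(mr, q, next) {
                if (mr->xid == xid)
                        return mr;
                if (mr->xid > xid)      /* walked past it: not present */
                        return NULL;
        }
        return NULL;
}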
+
+/*
+ * Because we can't depend on nlm_granted messages containing the same
+ * cookie we sent with the original lock request, we need code to test if
+ * an nlm_granted answer matches the lock request.  We also need code
+ * that can find a lockd message based solely on the nlm_granted answer.
+ */
+
+/*
+ * compare lockd message to answer
+ *
+ * returns 0 on equality and 1 if different
+ */
+static inline int
+nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
+{
+       if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
+               return 1;
+       if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
+               return 1;
+       if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
+               return 1;
+       if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
+               return 1;
+       if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
+               return 1;
+       if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
+               return 1;
+       return 0;
+}
+
+/*
+ * find a pending lock request message based on the lock info provided
+ * in the lockd_ans/nlm_granted data.  We need this because we can't
+ * depend on nlm_granted messages containing the same cookie we sent
+ * with the original lock request.
+ *
+ * We search from the head of the list assuming that the message we're
+ * looking for is for an older request (because we have an answer to it).
+ * This assumes that lock requests will be answered primarily in FIFO order.
+ * However, this may not be the case if there are blocked requests.  We may
+ * want to move blocked requests to a separate queue (but that'll complicate
+ * duplicate xid checking).
+ */
+static inline LOCKD_MSG_REQUEST *
+nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
+{
+       LOCKD_MSG_REQUEST *mr;
+
+       if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
+               return NULL;
+       TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
+               if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
+                       break;
+       }
+       return mr;
+}
+
+/*
+ * return the next unique lock request transaction ID
  */
+static inline uint64_t
+nfs_lockxid_get(void)
+{
+       LOCKD_MSG_REQUEST *mr;
+
+       /* derive initial lock xid from system time */
+       if (!nfs_lockxid) {
+               /*
+                * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
+                * due to a broken clock) because we immediately increment it
+                * and we guarantee to never use xid 0.  So, nfs_lockxid should only
+                * ever be 0 the first time this function is called.
+                */
+               struct timeval tv;
+               microtime(&tv);
+               nfs_lockxid = (uint64_t)tv.tv_sec << 12;
+       }
+
+       /* make sure we get a unique xid */
+       do {
+               /* Skip zero xid if it should ever happen.  */
+               if (++nfs_lockxid == 0)
+                       nfs_lockxid++;
+               if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
+                    (mr->lmr_msg.lm_xid < nfs_lockxid)) {
+                       /* fast path: empty queue or new largest xid */
+                       break;
+               }
+               /* check if xid is already in use */
+       } while (nfs_lockdmsg_find_by_xid(nfs_lockxid));
+
+       return nfs_lockxid;
+}
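
[Editor's note: stripped of the duplicate check against the pending queue, the
xid scheme is: seed a 64-bit counter from the clock (seconds shifted left 12
bits), then hand out strictly increasing, never-zero values. A userspace
sketch, with gettimeofday() standing in for the kernel's microtime():]

#include <stdint.h>
#include <stddef.h>
#include <sys/time.h>

static uint64_t lockxid;

static uint64_t
lockxid_get(void)
{
        if (!lockxid) {                 /* first call: derive from system time */
                struct timeval tv;
                gettimeofday(&tv, NULL);
                lockxid = (uint64_t)tv.tv_sec << 12;
        }
        if (++lockxid == 0)             /* never hand out xid 0 */
                lockxid++;
        return lockxid;
}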
+
+
+/*
+ * Check the nfs_lock_pid hash table for an entry and, if requested,
+ * add the entry if it is not found.
+ *
+ * (Also, if adding, try to clean up some stale entries.)
+ */
+static int
+nfs_lock_pid_check(proc_t p, int addflag, vnode_t vp)
+{
+       struct nfs_lock_pid *lp, *lplru, *lplru_next;
+       proc_t plru;
+       int error = 0;
+       struct timeval now;
+
+       /* lock hash */
+loop:
+       if (nfs_lock_pid_lock) {
+               struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
+               while (nfs_lock_pid_lock) {
+                       nfs_lock_pid_lock = -1;
+                       tsleep(&nfs_lock_pid_lock, PCATCH, "nfslockpid", 0);
+                       if ((error = nfs_sigintr(nmp, NULL, p)))
+                               return (error);
+               }
+               goto loop;
+       }
+       nfs_lock_pid_lock = 1;
+
+       /* Search hash chain */
+       error = ENOENT;
+       lp = NFS_LOCK_PID_HASH(proc_pid(p))->lh_first;
+       for (; lp != NULL; lp = lp->lp_hash.le_next)
+               if (lp->lp_pid == proc_pid(p)) {
+                       /* found pid... */
+                       if (timevalcmp(&lp->lp_pid_start, &p->p_stats->p_start, ==)) {
+                               /* ...and it's valid */
+                               /* move to tail of LRU */
+                               TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
+                               microuptime(&now);
+                               lp->lp_time = now.tv_sec;
+                               TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
+                               error = 0;
+                               break;
+                       }
+                       /* ...but it's no longer valid */
+                       /* remove from hash, invalidate, and move to lru head */
+                       LIST_REMOVE(lp, lp_hash);
+                       lp->lp_valid = 0;
+                       TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
+                       TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
+                       lp = NULL;
+                       break;
+               }
+
+       /* if we didn't find it (valid) and we've been asked to add it */
+       if ((error == ENOENT) && addflag) {
+               /* scan lru list for invalid, stale entries to reuse/free */
+               int lrucnt = 0;
+               microuptime(&now);
+               for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
+                       lplru_next = TAILQ_NEXT(lplru, lp_lru);
+                       if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
+                               /*
+                                * If the oldest LRU entry is relatively new, then don't
+                                * bother scanning any further.
+                                */
+                               break;
+                       }
+                       /* remove entry from LRU, and check if it's still in use */
+                       TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
+                       if (!lplru->lp_valid || !(plru = pfind(lplru->lp_pid)) ||
+                           timevalcmp(&lplru->lp_pid_start, &plru->p_stats->p_start, !=)) {
+                               /* no longer in use */
+                               LIST_REMOVE(lplru, lp_hash);
+                               if (!lp) {
+                                       /* we'll reuse this one */
+                                       lp = lplru;
+                               } else {
+                                       /* we can free this one */
+                                       FREE(lplru, M_TEMP);
+                               }
+                       } else {
+                               /* still in use */
+                               lplru->lp_time = now.tv_sec;
+                               TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
+                       }
+                       /* don't check too many entries at once */
+                       if (++lrucnt > 8)
+                               break;
+               }
+               if (!lp) {
+                       /* we need to allocate a new one */
+                       MALLOC(lp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
+                               M_TEMP, M_WAITOK | M_ZERO);
+               }
+               if (!lp) {
+                       error = ENOMEM;
+               } else {
+                       /* (re)initialize nfs_lock_pid info */
+                       lp->lp_pid = proc_pid(p);
+                       lp->lp_pid_start = p->p_stats->p_start;
+                       /* insert pid in hash */
+                       LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
+                       lp->lp_valid = 1;
+                       lp->lp_time = now.tv_sec;
+                       TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
+                       error = 0;
+               }
+       }
+
+       /* unlock hash */
+       if (nfs_lock_pid_lock < 0) {
+               nfs_lock_pid_lock = 0;
+               wakeup(&nfs_lock_pid_lock);
+       } else
+               nfs_lock_pid_lock = 0;
+
+       return (error);
+}
+
 
 /*
  * nfs_advlock --
  *      NFS advisory byte-level locks.
  */
 int
-nfs_dolock(struct vop_advlock_args *ap)
-/* struct vop_advlock_args {
-        struct vnodeop_desc *a_desc;
-        struct vnode *a_vp;
-        caddr_t a_id;
-        int a_op;
-        struct flock *a_fl;
-        int a_flags;
+nfs_dolock(struct vnop_advlock_args *ap)
+/* struct vnop_advlock_args {
+       struct vnodeop_desc *a_desc;
+       vnode_t a_vp;
+       caddr_t a_id;
+       int a_op;
+       struct flock *a_fl;
+       int a_flags;
+       vfs_context_t a_context;
 }; */
 {
-       LOCKD_MSG msg;
-       struct nameidata nd;
-       struct vnode *vp, *wvp;
+       LOCKD_MSG_REQUEST msgreq;
+       LOCKD_MSG *msg;
+       vnode_t vp, wvp;
        struct nfsnode *np;
        int error, error1;
        struct flock *fl;
        int fmode, ioflg;
-       struct proc *p;
-        struct uthread *ut;
-       struct timeval elapsed;
        struct nfsmount *nmp;
-       struct vattr vattr;
+       struct nfs_vattr nvattr;
        off_t start, end;
+       struct timeval now;
+       int timeo, endtime, lastmsg, wentdown = 0;
+       int lockpidcheck;
+       kauth_cred_t cred;
+       proc_t p;
+       struct sockaddr *saddr;
 
-        ut = get_bsdthread_info(current_act());
-       p = current_proc();
+       p = vfs_context_proc(ap->a_context);
+       cred = vfs_context_ucred(ap->a_context);
 
        vp = ap->a_vp;
        fl = ap->a_fl;
        np = VTONFS(vp);
 
-       nmp = VFSTONFS(vp->v_mount);
+       nmp = VFSTONFS(vnode_mount(vp));
        if (!nmp)
                return (ENXIO);
        if (nmp->nm_flag & NFSMNT_NOLOCKS)
-               return (EOPNOTSUPP);
+               return (ENOTSUP);
 
        /*
         * The NLM protocol doesn't allow the server to return an error
@@ -169,11 +476,11 @@ nfs_dolock(struct vop_advlock_args *ap)
                        return (EINVAL);
        }
        /*
-        * If daemon is running take a ref on its fifo
+        * If the daemon is running, take a ref on its fifo vnode
         */
-       if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) {
-               if (!nfslockdwaiting)
-                       return (EOPNOTSUPP);
+       if (!(wvp = nfslockdvnode)) {
+               if (!nfslockdwaiting && !nfslockdstarttimeout)
+                       return (ENOTSUP);
                /*
                 * Don't wake lock daemon if it hasn't been started yet and
                 * this is an unlock request (since we couldn't possibly
@@ -181,37 +488,63 @@ nfs_dolock(struct vop_advlock_args *ap)
                 * uninformed unlock request due to closef()'s behavior of doing
                 * unlocks on all files if a process has had a lock on ANY file.
                 */
-               if (!nfslockdfp && (fl->l_type == F_UNLCK))
+               if (!nfslockdvnode && (fl->l_type == F_UNLCK))
                        return (EINVAL);
-               /* wake up lock daemon */
-               (void)wakeup((void *)&nfslockdwaiting);
-               /* wait on nfslockdfp for a while to allow daemon to start */
-               tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz);
-               /* check for nfslockdfp and f_data */
-               if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data))
-                       return (EOPNOTSUPP);
+               microuptime(&now);
+               if (nfslockdwaiting) {
+                       /* wake up lock daemon */
+                       nfslockdstarttimeout = now.tv_sec + 60;
+                       (void)wakeup((void *)&nfslockdwaiting);
+               }
+               /* wait on nfslockdvnode for a while to allow daemon to start */
+               while (!nfslockdvnode && (now.tv_sec < nfslockdstarttimeout)) {
+                       error = tsleep((void *)&nfslockdvnode, PCATCH | PUSER, "lockdstart", 2*hz);
+                       if (error && (error != EWOULDBLOCK))
+                               return (error);
+                       /* check that we still have our mount... */
+                       /* ...and that we still support locks */
+                       nmp = VFSTONFS(vnode_mount(vp));
+                       if (!nmp)
+                               return (ENXIO);
+                       if (nmp->nm_flag & NFSMNT_NOLOCKS)
+                               return (ENOTSUP);
+                       if (!error)
+                               break;
+                       microuptime(&now);
+               }
+               /*
+                * Check for nfslockdvnode.
+                * If the daemon hasn't started by now, there's a problem.
+                */
+               if (!(wvp = nfslockdvnode))
+                       return (ENOTSUP);
+       }
+       error = vnode_getwithref(wvp);
+       if (error)
+               return (ENOTSUP);
+       error = vnode_ref(wvp);
+       if (error) {
+               vnode_put(wvp);
+               return (ENOTSUP);
        }
-       VREF(wvp);
+
        /*
-        * if there is no nfsowner table yet, allocate one.
+        * Need to check if this process has successfully acquired an NFS lock before.
+        * If not, and this is an unlock request, we can simply return success here.
         */
-       if (ut->uu_nlminfo == NULL) {
+       lockpidcheck = nfs_lock_pid_check(p, 0, vp);
+       if (lockpidcheck) {
+               if (lockpidcheck != ENOENT) {
+                       vnode_rele(wvp);
+                       vnode_put(wvp);
+                       return (lockpidcheck);
+               }
                if (ap->a_op == F_UNLCK) {
-                       vrele(wvp);
+                       vnode_rele(wvp);
+                       vnode_put(wvp);
                        return (0);
                }
-               MALLOC(ut->uu_nlminfo, struct nlminfo *,
-                       sizeof(struct nlminfo), M_LOCKF, M_WAITOK | M_ZERO);
-               ut->uu_nlminfo->pid_start = p->p_stats->p_start;
        }
-       /*
-        * Fill in the information structure.
-        */
-       msg.lm_version = LOCKD_MSG_VERSION;
-       msg.lm_msg_ident.pid = p->p_pid;
-       msg.lm_msg_ident.ut = ut;
-       msg.lm_msg_ident.pid_start = ut->uu_nlminfo->pid_start;
-       msg.lm_msg_ident.msg_seq = ++(ut->uu_nlminfo->msg_seq);
 
        /*
         * The NFS Lock Manager protocol doesn't directly handle
@@ -233,23 +566,27 @@ nfs_dolock(struct vop_advlock_args *ap)
                /* need to flush, and refetch attributes to make */
                /* sure we have the correct end of file offset   */
                if (np->n_flag & NMODIFIED) {
-                       np->n_attrstamp = 0;
-                       error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1);
+                       NATTRINVALIDATE(np);
+                       error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                        if (error) {
-                               vrele(wvp);
+                               vnode_rele(wvp);
+                               vnode_put(wvp);
                                return (error);
                        }
                }
-               np->n_attrstamp = 0;
-               error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
+               NATTRINVALIDATE(np);
+
+               error = nfs_getattr(vp, &nvattr, cred, p);
                if (error) {
-                       vrele(wvp);
+                       vnode_rele(wvp);
+                       vnode_put(wvp);
                        return (error);
                }
                start = np->n_size + fl->l_start;
                break;
        default:
-               vrele(wvp);
+               vnode_rele(wvp);
+               vnode_put(wvp);
                return (EINVAL);
        }
        if (fl->l_len == 0)
@@ -261,128 +598,339 @@ nfs_dolock(struct vop_advlock_args *ap)
                start += fl->l_len;
        }
        if (start < 0) {
-               vrele(wvp);
+               vnode_rele(wvp);
+               vnode_put(wvp);
+               return (EINVAL);
+       }
+       if (!NFS_ISV3(vp) &&
+           ((start >= 0x80000000) || (end >= 0x80000000))) {
+               vnode_rele(wvp);
+               vnode_put(wvp);
                return (EINVAL);
        }
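
[Editor's note: the code above reduces the flock to an absolute byte range:
l_whence has already been folded into start, l_len == 0 means lock-to-EOF
(end = -1), and a negative l_len counts backward from start. A standalone
sketch of that arithmetic; the helper name and its -1 error return are
illustrative, where the kernel returns EINVAL and drops its vnode refs:]

#include <fcntl.h>
#include <sys/types.h>

static int
flock_to_range(const struct flock *fl, off_t start, off_t *startp, off_t *endp)
{
        off_t end;

        if (fl->l_len == 0) {
                end = -1;                       /* lock extends to end of file */
        } else if (fl->l_len > 0) {
                end = start + fl->l_len - 1;
        } else {                                /* negative len: range precedes start */
                end = start - 1;
                start += fl->l_len;
        }
        if (start < 0)
                return (-1);                    /* kernel code returns EINVAL */
        *startp = start;
        *endp = end;
        return (0);
}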
 
-       msg.lm_fl = *fl;
-       msg.lm_fl.l_start = start;
+       /*
+        * Fill in the information structure.
+        */
+       msgreq.lmr_answered = 0;
+       msgreq.lmr_errno = 0;
+       msgreq.lmr_saved_errno = 0;
+       msg = &msgreq.lmr_msg;
+       msg->lm_version = LOCKD_MSG_VERSION;
+       msg->lm_flags = 0;
+
+       msg->lm_fl = *fl;
+       msg->lm_fl.l_start = start;
        if (end != -1)
-               msg.lm_fl.l_len = end - start + 1;
+               msg->lm_fl.l_len = end - start + 1;
+       msg->lm_fl.l_pid = proc_pid(p);
 
-       msg.lm_wait = ap->a_flags & F_WAIT;
-       msg.lm_getlk = ap->a_op == F_GETLK;
+       if (ap->a_flags & F_WAIT)
+               msg->lm_flags |= LOCKD_MSG_BLOCK;
+       if (ap->a_op == F_GETLK)
+               msg->lm_flags |= LOCKD_MSG_TEST;
 
-       nmp = VFSTONFS(vp->v_mount);
+       nmp = VFSTONFS(vnode_mount(vp));
        if (!nmp) {
-               vrele(wvp);
+               vnode_rele(wvp);
+               vnode_put(wvp);
                return (ENXIO);
        }
 
-       bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg.lm_addr,
-             min(sizeof msg.lm_addr,
-                 mtod(nmp->nm_nam, struct sockaddr *)->sa_len));
-       msg.lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
-       bcopy(VTONFS(vp)->n_fhp, msg.lm_fh, msg.lm_fh_len);
-       msg.lm_nfsv3 = NFS_ISV3(vp);
-       cru2x(p->p_ucred, &msg.lm_cred);
+       saddr = mbuf_data(nmp->nm_nam);
+       bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len));
+       msg->lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
+       bcopy(VTONFS(vp)->n_fhp, msg->lm_fh, msg->lm_fh_len);
+       if (NFS_ISV3(vp))
+               msg->lm_flags |= LOCKD_MSG_NFSV3;
+       cru2x(cred, &msg->lm_cred);
 
-       microuptime(&ut->uu_nlminfo->nlm_lockstart);
+       microuptime(&now);
+       lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
 
        fmode = FFLAGS(O_WRONLY);
-       if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) {
-               vrele(wvp);
+       if ((error = VNOP_OPEN(wvp, fmode, ap->a_context))) {
+               vnode_rele(wvp);
+               vnode_put(wvp);
                return (error);
        }
+       vnode_lock(wvp);
        ++wvp->v_writecount;
+       vnode_unlock(wvp);
+
+       /* allocate unique xid */
+       msg->lm_xid = nfs_lockxid_get();
+       nfs_lockdmsg_enqueue(&msgreq);
 
+       timeo = 2*hz;
 #define IO_NOMACCHECK 0;
        ioflg = IO_UNIT | IO_NOMACCHECK;
        for (;;) {
-               VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE);
-
+               error = 0;
                while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
                        nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
-                       if (tsleep((void *)&nfslockdfifolock, PCATCH | PUSER, "lockdfifo", 20*hz))
+                       error = tsleep((void *)&nfslockdfifolock,
+                                       PCATCH | PUSER, "lockdfifo", 20*hz);
+                       if (error)
                                break;
                }
+               if (error)
+                       break;
                nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;
 
-               error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)&msg, sizeof(msg), 0,
-                   UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p);
-
-               nfslockdfifowritten = 1;
+               error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)msg, sizeof(*msg), 0,
+                   UIO_SYSSPACE32, ioflg, proc_ucred(kernproc), NULL, p);
 
                nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
                if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
                        nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
                        wakeup((void *)&nfslockdfifolock);
                }
-               /* wake up lock daemon */
-               if (nfslockdwaiting)
-                       (void)wakeup((void *)&nfslockdwaiting);
 
                if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
                        break;
                }
+
                /*
-                * If we're locking a file, wait for an answer.  Unlocks succeed
-                * immediately.
+                * Always wait for an answer.  Not waiting for unlocks could
+                * cause a lock to be left behind on the server if the unlock
+                * request gets dropped.
                 */
-               if (fl->l_type == F_UNLCK)
-                       /*
-                        * XXX this isn't exactly correct.  The client side
-                        * needs to continue sending it's unlock until
-                        * it gets a response back.
-                        */
-                       break;
 
                /*
-                * retry after 20 seconds if we haven't gotten a response yet.
-                * This number was picked out of thin air... but is longer
-                * then even a reasonably loaded system should take (at least
-                * on a local network).  XXX Probably should use a back-off
-                * scheme.
+                * Retry if it takes too long to get a response.
+                *
+                * The timeout numbers were picked out of thin air... they start
+                * at 2 seconds and double on each timeout, with a max of 60 seconds.
+                *
+                * In order to maintain responsiveness, we pass a small timeout
+                * to tsleep and calculate the timeouts ourselves.  This allows
+                * us to pick up on mount changes more quickly.
                 */
-               if ((error = tsleep((void *)ut->uu_nlminfo,
-                                   PCATCH | PUSER, "lockd", 20*hz)) != 0) {
-                       if (error == EWOULDBLOCK) {
+wait_for_granted:
+               error = EWOULDBLOCK;
+               microuptime(&now);
+               if ((timeo/hz) > 0)
+                       endtime = now.tv_sec + timeo/hz;
+               else
+                       endtime = now.tv_sec + 1;
+               while (now.tv_sec < endtime) {
+                       error = tsleep((void *)&msgreq, PCATCH | PUSER, "lockd", 2*hz);
+                       if (msgreq.lmr_answered) {
+                               /*
+                                * Note: it's possible to have a lock granted at
+                                * essentially the same time that we get interrupted.
+                                * Since the lock may be granted, we can't return an
+                                * error from this request or we might not unlock the
+                                * lock that's been granted.
+                                */
+                               error = 0;
+                               break;
+                       }
+                       if (error != EWOULDBLOCK)
+                               break;
+                       /* check that we still have our mount... */
+                       /* ...and that we still support locks */
+                       nmp = VFSTONFS(vnode_mount(vp));
+                       if (!nmp || (nmp->nm_flag & NFSMNT_NOLOCKS))
+                               break;
+                       /*
+                        * If the mount is hung and we've requested not to hang
+                        * on remote filesystems, then bail now.
+                        */
+                       if ((p != NULL) && ((proc_noremotehang(p)) != 0) &&
+                           ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
+                               if (fl->l_type == F_UNLCK)
+                                       printf("nfs_dolock: aborting unlock request "
+                                           "due to timeout (noremotehang)\n");
+                               error = EIO;
+                               break;
+                       }
+                       microuptime(&now);
+               }
+               if (error) {
+                       /* check that we still have our mount... */
+                       nmp = VFSTONFS(vnode_mount(vp));
+                       if (!nmp) {
+                               if (error == EWOULDBLOCK)
+                                       error = ENXIO;
+                               break;
+                       }
+                       /* ...and that we still support locks */
+                       if (nmp->nm_flag & NFSMNT_NOLOCKS) {
+                               if (error == EWOULDBLOCK)
+                                       error = ENOTSUP;
+                               break;
+                       }
+                       if ((error == ENOTSUP) &&
+                           (nmp->nm_state & NFSSTA_LOCKSWORK)) {
+                               /*
+                                * We have evidence that locks work, yet lockd
+                                * returned ENOTSUP.  This is probably because
+                                * it was unable to contact the server's lockd to
+                                * send it the request.
+                                *
+                                * Because we know locks work, we'll consider
+                                * this failure to be a timeout.
+                                */
+                               error = EWOULDBLOCK;
+                       }
+                       if (error != EWOULDBLOCK) {
+                               /*
+                                * We're going to bail on this request.
+                                * If we were a blocked lock request, send a cancel.
+                                */
+                               if ((msgreq.lmr_errno == EINPROGRESS) &&
+                                   !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
+                                       /* set this request up as a cancel */
+                                       msg->lm_flags |= LOCKD_MSG_CANCEL;
+                                       nfs_lockdmsg_dequeue(&msgreq);
+                                       msg->lm_xid = nfs_lockxid_get();
+                                       nfs_lockdmsg_enqueue(&msgreq);
+                                       msgreq.lmr_saved_errno = error;
+                                       msgreq.lmr_errno = 0;
+                                       msgreq.lmr_answered = 0;
+                                       /* reset timeout */
+                                       timeo = 2*hz;
+                                       /* send cancel request */
+                                       continue;
+                               }
+                               break;
+                       }
+
+                       /*
+                        * If the mount is hung and we've requested not to hang
+                        * on remote filesystems, then bail now.
+                        */
+                       if ((p != NULL) && ((proc_noremotehang(p)) != 0) &&
+                           ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
+                               if (fl->l_type == F_UNLCK)
+                                       printf("nfs_dolock: aborting unlock request "
+                                           "due to timeout (noremotehang)\n");
+                               error = EIO;
+                               break;
+                       }
+                       /* warn if we're not getting any response */
+                       microuptime(&now);
+                       if ((msgreq.lmr_errno != EINPROGRESS) &&
+                           (nmp->nm_tprintf_initial_delay != 0) &&
+                           ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
+                               lastmsg = now.tv_sec;
+                               nfs_down(nmp, p, 0, NFSSTA_LOCKTIMEO, "lockd not responding");
+                               wentdown = 1;
+                       }
+                       if (msgreq.lmr_errno == EINPROGRESS) {
                                /*
-                                * We timed out, so we rewrite the request
-                                * to the fifo, but only if it isn't already
-                                * full.
+                                * We've got a blocked lock request that we are
+                                * going to retry.  First, we'll want to try to
+                                * send a cancel for the previous request.
+                                *
+                                * Clear errno so if we don't get a response
+                                * to the resend we'll call nfs_down().
+                                * Also reset timeout because we'll expect a
+                                * quick response to the cancel/resend (even if
+                                * it is NLM_BLOCKED).
                                 */
-                               ioflg |= IO_NDELAY;
+                               msg->lm_flags |= LOCKD_MSG_CANCEL;
+                               nfs_lockdmsg_dequeue(&msgreq);
+                               msg->lm_xid = nfs_lockxid_get();
+                               nfs_lockdmsg_enqueue(&msgreq);
+                               msgreq.lmr_saved_errno = msgreq.lmr_errno;
+                               msgreq.lmr_errno = 0;
+                               msgreq.lmr_answered = 0;
+                               timeo = 2*hz;
+                               /* send cancel then resend request */
                                continue;
                        }
+                       /*
+                        * We timed out, so we will rewrite the request
+                        * to the fifo, but only if it isn't already full.
+                        */
+                       ioflg |= IO_NDELAY;
+                       timeo *= 2;
+                       if (timeo > 60*hz)
+                               timeo = 60*hz;
+                       /* resend request */
+                       continue;
+               }
 
-                       break;
+               /* we got a response, so the server's lockd is OK */
+               nfs_up(VFSTONFS(vnode_mount(vp)), p, NFSSTA_LOCKTIMEO,
+                       wentdown ? "lockd alive again" : NULL);
+               wentdown = 0;
+
+               if (msgreq.lmr_errno == EINPROGRESS) {
+                       /* got NLM_BLOCKED response */
+                       /* need to wait for NLM_GRANTED */
+                       timeo = 60*hz;
+                       msgreq.lmr_answered = 0;
+                       goto wait_for_granted;
+               }
+
+               if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
+                   (msgreq.lmr_saved_errno == EINPROGRESS)) {
+                       /*
+                        * We just got a successful reply to the
+                        * cancel of the previous blocked lock request.
+                        * Now, go ahead and resend the request.
+                        */
+                       msg->lm_flags &= ~LOCKD_MSG_CANCEL;
+                       nfs_lockdmsg_dequeue(&msgreq);
+                       msg->lm_xid = nfs_lockxid_get();
+                       nfs_lockdmsg_enqueue(&msgreq);
+                       msgreq.lmr_saved_errno = 0;
+                       msgreq.lmr_errno = 0;
+                       msgreq.lmr_answered = 0;
+                       timeo = 2*hz;
+                       /* resend request */
+                       continue;
                }
 
-               if (msg.lm_getlk && ut->uu_nlminfo->retcode == 0) {
-                       if (ut->uu_nlminfo->set_getlk) {
-                               fl->l_pid = ut->uu_nlminfo->getlk_pid;
-                               fl->l_start = ut->uu_nlminfo->getlk_start;
-                               fl->l_len = ut->uu_nlminfo->getlk_len;
+               if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
+                       if (msg->lm_fl.l_type != F_UNLCK) {
+                               fl->l_type = msg->lm_fl.l_type;
+                               fl->l_pid = msg->lm_fl.l_pid;
+                               fl->l_start = msg->lm_fl.l_start;
+                               fl->l_len = msg->lm_fl.l_len;
                                fl->l_whence = SEEK_SET;
                        } else {
                                fl->l_type = F_UNLCK;
                        }
                }
-               error = ut->uu_nlminfo->retcode;
+
+               /*
+                * If the blocked lock request was cancelled,
+                * restore the error condition from when we
+                * originally bailed on the request.
+                */
+               if (msg->lm_flags & LOCKD_MSG_CANCEL) {
+                       msg->lm_flags &= ~LOCKD_MSG_CANCEL;
+                       error = msgreq.lmr_saved_errno;
+               } else
+                       error = msgreq.lmr_errno;
+
+               if (!error) {
+                       /* record that NFS file locking has worked on this mount */
+                       nmp = VFSTONFS(vnode_mount(vp));
+                       if (nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK))
+                               nmp->nm_state |= NFSSTA_LOCKSWORK;
+                       /*
+                        * If we successfully acquired a lock, make sure this pid
+                        * is in the nfs_lock_pid hash table so we know we can't
+                        * short-circuit unlock requests.
+                        */
+                       if ((lockpidcheck == ENOENT) &&
+                           ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW)))
+                               nfs_lock_pid_check(p, 1, vp);
+
+               }
                break;
        }
 
-       /* XXX stats */
-       nfsadvlocks++;
-       microuptime(&elapsed);
-       timevalsub(&elapsed, &ut->uu_nlminfo->nlm_lockstart);
-       if (timevalcmp(&elapsed, &nfsadvlock_longest, >))
-               nfsadvlock_longest = elapsed;
-       timevaladd(&nfsadvlocks_time, &elapsed);
-       timerclear(&ut->uu_nlminfo->nlm_lockstart);
+       nfs_lockdmsg_dequeue(&msgreq);
 
-       error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p);
+       error1 = VNOP_CLOSE(wvp, FWRITE, ap->a_context);
+       vnode_rele(wvp);
+       vnode_put(wvp);
        /* prefer any previous 'error' to our vn_close 'error1'. */
        return (error != 0 ? error : error1);
 }
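
[Editor's note: the retry policy in the loop above boils down to exponential
backoff with a cap; a minimal helper capturing just that step, with hz being
the tick rate as in the kernel:]

/* sketch: timeouts start at 2*hz, double on each expiry, cap at 60*hz */
static int
next_timeo(int timeo, int hz)
{
        timeo *= 2;
        if (timeo > 60*hz)
                timeo = 60*hz;
        return (timeo);
}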
@@ -392,62 +940,58 @@ nfs_dolock(struct vop_advlock_args *ap)
  *      NFS advisory byte-level locks answer from the lock daemon.
  */
 int
-nfslockdans(struct proc *p, struct lockd_ans *ansp)
+nfslockdans(proc_t p, struct lockd_ans *ansp)
 {
-       struct proc *targetp;
-       struct uthread *targetut, *uth;
+       LOCKD_MSG_REQUEST *msgreq;
        int error;
 
-       /*
-        * Let root, or someone who once was root (lockd generally
-        * switches to the daemon uid once it is done setting up) make
-        * this call.
-        *
-        * XXX This authorization check is probably not right.
-        */
-       if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 &&
-           p->p_cred->p_svuid != 0)
+       /* Let root make this call. */
+       error = proc_suser(p);
+       if (error)
                return (error);
 
        /* the version should match, or we're out of sync */
-       if (ansp->la_vers != LOCKD_ANS_VERSION)
+       if (ansp->la_version != LOCKD_ANS_VERSION)
                return (EINVAL);
 
-       /* Find the process & thread */
-       if ((targetp = pfind(ansp->la_msg_ident.pid)) == NULL)
-               return (ESRCH);
-       targetut = ansp->la_msg_ident.ut;
-       TAILQ_FOREACH(uth, &targetp->p_uthlist, uu_list) {
-               if (uth == targetut)
-                       break;
+       /* try to find the lockd message by transaction id (cookie) */
+       msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
+       if (ansp->la_flags & LOCKD_ANS_GRANTED) {
+               /*
+                * We can't depend on the granted message having our cookie,
+                * so we check the answer against the lockd message found.
+                * If no message was found or it doesn't match the answer,
+                * we look for the lockd message by the answer's lock info.
+                */
+               if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
+                       msgreq = nfs_lockdmsg_find_by_answer(ansp);
+               /*
+                * We need to make sure this request isn't being cancelled.
+                * If it is, we don't want to accept the granted message.
+                */
+               if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
+                       msgreq = NULL;
        }
-       /*
-        * Verify the pid hasn't been reused (if we can), and it isn't waiting
-        * for an answer from a more recent request.  We return an EPIPE if
-        * the match fails, because we've already used ESRCH above, and this
-        * is sort of like writing on a pipe after the reader has closed it.
-        * If only the seq# is off, don't return an error just return.  It could
-        * just be a response to a retransmitted request.
-        */
-       if (uth == NULL || uth != targetut || targetut->uu_nlminfo == NULL)
+       if (!msgreq)
                return (EPIPE);
-       if (ansp->la_msg_ident.msg_seq != -1) {
-               if (timevalcmp(&targetut->uu_nlminfo->pid_start,
-                              &ansp->la_msg_ident.pid_start, !=))
-                       return (EPIPE);
-               if (targetut->uu_nlminfo->msg_seq != ansp->la_msg_ident.msg_seq)
-                       return (0);
-       }
-
-       /* Found the thread, so set its return errno and wake it up. */
 
-       targetut->uu_nlminfo->retcode = ansp->la_errno;
-       targetut->uu_nlminfo->set_getlk = ansp->la_getlk_set;
-       targetut->uu_nlminfo->getlk_pid = ansp->la_getlk_pid;
-       targetut->uu_nlminfo->getlk_start = ansp->la_getlk_start;
-       targetut->uu_nlminfo->getlk_len = ansp->la_getlk_len;
+       msgreq->lmr_errno = ansp->la_errno;
+       if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
+               if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
+                       if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
+                               msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
+                       else
+                               msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
+                       msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
+                       msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
+                       msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
+               } else {
+                       msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
+               }
+       }
 
-       (void)wakeup((void *)targetut->uu_nlminfo);
+       msgreq->lmr_answered = 1;
+       (void)wakeup((void *)msgreq);
 
        return (0);
 }
@@ -457,28 +1001,38 @@ nfslockdans(struct proc *p, struct lockd_ans *ansp)
  *      NFS advisory byte-level locks: fifo file# from the lock daemon.
  */
 int
-nfslockdfd(struct proc *p, int fd)
+nfslockdfd(proc_t p, int fd)
 {
        int error;
-       struct file *fp, *ofp;
+       vnode_t vp, oldvp;
 
-       error = suser(p->p_ucred, &p->p_acflag);
+       error = proc_suser(p);
        if (error)
                return (error);
        if (fd < 0) {
-               fp = 0;
+               vp = NULL;
        } else {
-               error = getvnode(p, fd, &fp);
+               error = file_vnode(fd, &vp);
                if (error)
                        return (error);
-               (void)fref(fp);
+               error = vnode_getwithref(vp);
+               if (error)
+                       return (error);
+               error = vnode_ref(vp);
+               if (error) {
+                       vnode_put(vp);
+                       return (error);
+               }
+       }
+       oldvp = nfslockdvnode;
+       nfslockdvnode = vp;
+       if (oldvp) {
+               vnode_rele(oldvp);
+       }
+       (void)wakeup((void *)&nfslockdvnode);
+       if (vp) {
+               vnode_put(vp);
        }
-       ofp = nfslockdfp;
-       nfslockdfp = fp;
-       if (ofp)
-               (void)frele(ofp);
-       nfslockdpid = nfslockdfp ? p->p_pid : 0;
-       (void)wakeup((void *)&nfslockdfp);
        return (0);
 }
 
@@ -487,23 +1041,17 @@ nfslockdfd(struct proc *p, int fd)
  *      lock daemon waiting for lock request
  */
 int
-nfslockdwait(struct proc *p)
+nfslockdwait(proc_t p)
 {
        int error;
-       struct file *fp, *ofp;
 
-       if (p->p_pid != nfslockdpid) {
-               error = suser(p->p_ucred, &p->p_acflag);
-               if (error)
-                       return (error);
-       }
-       if (nfslockdwaiting)
+       error = proc_suser(p);
+       if (error)
+               return (error);
+       if (nfslockdwaiting || nfslockdvnode)
                return (EBUSY);
-       if (nfslockdfifowritten) {
-               nfslockdfifowritten = 0;
-               return (0);
-       }
 
+       nfslockdstarttimeout = 0;
        nfslockdwaiting = 1;
        tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
        nfslockdwaiting = 0;