X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/ab86ba338a07a58a89f50cf7066a0f0e487ac0cc..eee3565979933af707c711411001ba11fe406a3c:/bsd/nfs/nfs_lock.c diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index e63ab6f25..aaf567271 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -1,16 +1,19 @@ /* - * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2016 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -20,7 +23,7 @@ * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /*- * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. @@ -57,22 +60,21 @@ #include #include #include /* for hz */ -#include -#include +#include #include #include /* for hz */ /* Must come after sys/malloc.h */ -#include -#include -#include -#include +#include +#include +#include /* for p_start */ +#include #include #include -#include #include #include -#include +#include -#include +#include +#include #include @@ -81,432 +83,924 @@ #include #include #include +#include #include #include #include -#include -#define OFF_MAX QUAD_MAX +#include +#include +#include +#include + +extern void ipc_port_release_send(ipc_port_t); + +/* + * pending lock request messages are kept in this queue which is + * kept sorted by transaction ID (xid). + */ +static uint64_t nfs_lockxid = 0; +static LOCKD_MSG_QUEUE nfs_pendlockq; + +/* list of mounts that are (potentially) making lockd requests */ +TAILQ_HEAD(nfs_lockd_mount_list,nfsmount) nfs_lockd_mount_list; -uint64_t nfsadvlocks = 0; -struct timeval nfsadvlock_longest = {0, 0}; -struct timeval nfsadvlocks_time = {0, 0}; +static lck_grp_t *nfs_lock_lck_grp; +static lck_mtx_t *nfs_lock_mutex; -pid_t nfslockdpid = 0; -struct file *nfslockdfp = 0; -int nfslockdwaiting = 0; -int nfslockdfifowritten = 0; -int nfslockdfifolock = 0; -#define NFSLOCKDFIFOLOCK_LOCKED 1 -#define NFSLOCKDFIFOLOCK_WANT 2 +void nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *); +void nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *); +int nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *, struct lockd_ans *); +LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_answer(struct lockd_ans *); +LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_xid(uint64_t); +uint64_t nfs_lockxid_get(void); +int nfs_lockd_send_request(LOCKD_MSG *, int); /* - * XXX - * We have to let the process know if the call succeeded. I'm using an extra - * field in the uu_nlminfo field in the uthread structure, as it is already for - * lockd stuff. + * initialize global nfs lock state */ +void +nfs_lockinit(void) +{ + TAILQ_INIT(&nfs_pendlockq); + TAILQ_INIT(&nfs_lockd_mount_list); + + nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL); + nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL); +} /* - * nfs_advlock -- - * NFS advisory byte-level locks. + * Register a mount as (potentially) making lockd requests. */ -int -nfs_dolock(struct vop_advlock_args *ap) -/* struct vop_advlock_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; -}; */ +void +nfs_lockd_mount_register(struct nfsmount *nmp) { - LOCKD_MSG msg; - struct nameidata nd; - struct vnode *vp, *wvp; - struct nfsnode *np; - int error, error1; - struct flock *fl; - int fmode, ioflg; - struct proc *p; - struct uthread *ut; - struct timeval elapsed; - struct nfsmount *nmp; - struct vattr vattr; - off_t start, end; + lck_mtx_lock(nfs_lock_mutex); + TAILQ_INSERT_HEAD(&nfs_lockd_mount_list, nmp, nm_ldlink); + nfs_lockd_mounts++; + lck_mtx_unlock(nfs_lock_mutex); +} + +/* + * Unregister a mount as (potentially) making lockd requests. + * + * When the lockd mount count drops to zero, then send a shutdown request to + * lockd if we've sent any requests to it. + */ +void +nfs_lockd_mount_unregister(struct nfsmount *nmp) +{ + int send_shutdown; + mach_port_t lockd_port = IPC_PORT_NULL; + kern_return_t kr; + + lck_mtx_lock(nfs_lock_mutex); + if (nmp->nm_ldlink.tqe_next == NFSNOLIST) { + lck_mtx_unlock(nfs_lock_mutex); + return; + } + + TAILQ_REMOVE(&nfs_lockd_mount_list, nmp, nm_ldlink); + nmp->nm_ldlink.tqe_next = NFSNOLIST; - ut = get_bsdthread_info(current_act()); - p = current_proc(); + nfs_lockd_mounts--; - vp = ap->a_vp; - fl = ap->a_fl; - np = VTONFS(vp); + /* send a shutdown request if there are no more lockd mounts */ + send_shutdown = ((nfs_lockd_mounts == 0) && nfs_lockd_request_sent); + if (send_shutdown) + nfs_lockd_request_sent = 0; - nmp = VFSTONFS(vp->v_mount); - if (!nmp) - return (ENXIO); - if (nmp->nm_flag & NFSMNT_NOLOCKS) - return (EOPNOTSUPP); + lck_mtx_unlock(nfs_lock_mutex); + + if (!send_shutdown) + return; /* - * The NLM protocol doesn't allow the server to return an error - * on ranges, so we do it. Pre LFS (Large File Summit) - * standards required EINVAL for the range errors. More recent - * standards use EOVERFLOW, but their EINVAL wording still - * encompasses these errors. - * Any code sensitive to this is either: - * 1) written pre-LFS and so can handle only EINVAL, or - * 2) written post-LFS and thus ought to be tolerant of pre-LFS - * implementations. - * Since returning EOVERFLOW certainly breaks 1), we return EINVAL. - */ - if (fl->l_whence != SEEK_END) { - if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) || - fl->l_start < 0 || - (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) || - (fl->l_len < 0 && fl->l_start + fl->l_len < 0)) - return (EINVAL); - } - /* - * If daemon is running take a ref on its fifo - */ - if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) { - if (!nfslockdwaiting) - return (EOPNOTSUPP); - /* - * Don't wake lock daemon if it hasn't been started yet and - * this is an unlock request (since we couldn't possibly - * actually have a lock on the file). This could be an - * uninformed unlock request due to closef()'s behavior of doing - * unlocks on all files if a process has had a lock on ANY file. - */ - if (!nfslockdfp && (fl->l_type == F_UNLCK)) - return (EINVAL); - /* wake up lock daemon */ - (void)wakeup((void *)&nfslockdwaiting); - /* wait on nfslockdfp for a while to allow daemon to start */ - tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz); - /* check for nfslockdfp and f_data */ - if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) - return (EOPNOTSUPP); - } - VREF(wvp); - /* - * if there is no nfsowner table yet, allocate one. + * Let lockd know that it is no longer needed for any NFS mounts */ - if (ut->uu_nlminfo == NULL) { - if (ap->a_op == F_UNLCK) { - vrele(wvp); - return (0); - } - MALLOC(ut->uu_nlminfo, struct nlminfo *, - sizeof(struct nlminfo), M_LOCKF, M_WAITOK | M_ZERO); - ut->uu_nlminfo->pid_start = p->p_stats->p_start; + kr = host_get_lockd_port(host_priv_self(), &lockd_port); + if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(lockd_port)) { + printf("nfs_lockd_mount_change: shutdown couldn't get port, kr %d, port %s\n", + kr, (lockd_port == IPC_PORT_NULL) ? "NULL" : + (lockd_port == IPC_PORT_DEAD) ? "DEAD" : "VALID"); + return; } - /* - * Fill in the information structure. - */ - msg.lm_version = LOCKD_MSG_VERSION; - msg.lm_msg_ident.pid = p->p_pid; - msg.lm_msg_ident.ut = ut; - msg.lm_msg_ident.pid_start = ut->uu_nlminfo->pid_start; - msg.lm_msg_ident.msg_seq = ++(ut->uu_nlminfo->msg_seq); - /* - * The NFS Lock Manager protocol doesn't directly handle - * negative lengths or SEEK_END, so we need to normalize - * things here where we have all the info. - * (Note: SEEK_CUR is already adjusted for at this point) - */ - /* Convert the flock structure into a start and end. */ - switch (fl->l_whence) { - case SEEK_SET: - case SEEK_CUR: - /* - * Caller is responsible for adding any necessary offset - * to fl->l_start when SEEK_CUR is used. - */ - start = fl->l_start; - break; - case SEEK_END: - /* need to flush, and refetch attributes to make */ - /* sure we have the correct end of file offset */ - if (np->n_flag & NMODIFIED) { - np->n_xid = 0; - error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1); - if (error) { - vrele(wvp); - return (error); - } - } - np->n_xid = 0; - error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); - if (error) { - vrele(wvp); - return (error); - } - start = np->n_size + fl->l_start; - break; - default: - vrele(wvp); - return (EINVAL); + kr = lockd_shutdown(lockd_port); + if (kr != KERN_SUCCESS) + printf("nfs_lockd_mount_change: shutdown %d\n", kr); + + ipc_port_release_send(lockd_port); +} + +/* + * insert a lock request message into the pending queue + * (nfs_lock_mutex must be held) + */ +void +nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq) +{ + LOCKD_MSG_REQUEST *mr; + + mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue); + if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) { + /* fast path: empty queue or new largest xid */ + TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next); + return; } - if (fl->l_len == 0) - end = -1; - else if (fl->l_len > 0) - end = start + fl->l_len - 1; - else { /* l_len is negative */ - end = start - 1; - start += fl->l_len; + /* slow path: need to walk list to find insertion point */ + while (mr && (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) { + mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next); } - if (start < 0) { - vrele(wvp); - return (EINVAL); + if (mr) { + TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next); + } else { + TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next); } +} - msg.lm_fl = *fl; - msg.lm_fl.l_start = start; - if (end != -1) - msg.lm_fl.l_len = end - start + 1; +/* + * remove a lock request message from the pending queue + * (nfs_lock_mutex must be held) + */ +void +nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq) +{ + TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next); +} - msg.lm_wait = ap->a_flags & F_WAIT; - msg.lm_getlk = ap->a_op == F_GETLK; +/* + * find a pending lock request message by xid + * + * We search from the head of the list assuming that the message we're + * looking for is for an older request (because we have an answer to it). + * This assumes that lock request will be answered primarily in FIFO order. + * However, this may not be the case if there are blocked requests. We may + * want to move blocked requests to a separate queue (but that'll complicate + * duplicate xid checking). + * + * (nfs_lock_mutex must be held) + */ +LOCKD_MSG_REQUEST * +nfs_lockdmsg_find_by_xid(uint64_t lockxid) +{ + LOCKD_MSG_REQUEST *mr; - nmp = VFSTONFS(vp->v_mount); - if (!nmp) { - vrele(wvp); - return (ENXIO); + TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) { + if (mr->lmr_msg.lm_xid == lockxid) + return mr; + if (mr->lmr_msg.lm_xid > lockxid) + return NULL; } + return mr; +} - bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg.lm_addr, - min(sizeof msg.lm_addr, - mtod(nmp->nm_nam, struct sockaddr *)->sa_len)); - msg.lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH; - bcopy(VTONFS(vp)->n_fhp, msg.lm_fh, msg.lm_fh_len); - msg.lm_nfsv3 = NFS_ISV3(vp); - cru2x(p->p_ucred, &msg.lm_cred); +/* + * Because we can't depend on nlm_granted messages containing the same + * cookie we sent with the original lock request, we need code to test + * if an nlm_granted answer matches the lock request. We also need code + * that can find a lockd message based solely on the nlm_granted answer. + */ - microuptime(&ut->uu_nlminfo->nlm_lockstart); +/* + * compare lockd message to answer + * + * returns 0 on equality and 1 if different + */ +int +nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp) +{ + if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO)) + return 1; + if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid) + return 1; + if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start) + return 1; + if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len) + return 1; + if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len) + return 1; + if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len)) + return 1; + return 0; +} - fmode = FFLAGS(O_WRONLY); - if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) { - vrele(wvp); - return (error); +/* + * find a pending lock request message based on the lock info provided + * in the lockd_ans/nlm_granted data. We need this because we can't + * depend on nlm_granted messages containing the same cookie we sent + * with the original lock request. + * + * We search from the head of the list assuming that the message we're + * looking for is for an older request (because we have an answer to it). + * This assumes that lock request will be answered primarily in FIFO order. + * However, this may not be the case if there are blocked requests. We may + * want to move blocked requests to a separate queue (but that'll complicate + * duplicate xid checking). + * + * (nfs_lock_mutex must be held) + */ +LOCKD_MSG_REQUEST * +nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp) +{ + LOCKD_MSG_REQUEST *mr; + + if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO)) + return NULL; + TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) { + if (!nfs_lockdmsg_compare_to_answer(mr, ansp)) + break; } - ++wvp->v_writecount; + return mr; +} -#define IO_NOMACCHECK 0; - ioflg = IO_UNIT | IO_NOMACCHECK; - for (;;) { - VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE); +/* + * return the next unique lock request transaction ID + * (nfs_lock_mutex must be held) + */ +uint64_t +nfs_lockxid_get(void) +{ + LOCKD_MSG_REQUEST *mr; - while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) { - nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT; - if (tsleep((void *)&nfslockdfifolock, PCATCH | PUSER, "lockdfifo", 20*hz)) - break; + /* derive initial lock xid from system time */ + if (!nfs_lockxid) { + /* + * Note: it's OK if this code inits nfs_lockxid to 0 (for example, + * due to a broken clock) because we immediately increment it + * and we guarantee to never use xid 0. So, nfs_lockxid should only + * ever be 0 the first time this function is called. + */ + struct timeval tv; + microtime(&tv); + nfs_lockxid = (uint64_t)tv.tv_sec << 12; + } + + /* make sure we get a unique xid */ + do { + /* Skip zero xid if it should ever happen. */ + if (++nfs_lockxid == 0) + nfs_lockxid++; + if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) || + (mr->lmr_msg.lm_xid < nfs_lockxid)) { + /* fast path: empty queue or new largest xid */ + break; } - nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED; + /* check if xid is already in use */ + } while (nfs_lockdmsg_find_by_xid(nfs_lockxid)); - error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)&msg, sizeof(msg), 0, - UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p); + return nfs_lockxid; +} - nfslockdfifowritten = 1; +#define MACH_MAX_TRIES 3 - nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED; - if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) { - nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT; - wakeup((void *)&nfslockdfifolock); - } - /* wake up lock daemon */ - if (nfslockdwaiting) - (void)wakeup((void *)&nfslockdwaiting); +int +nfs_lockd_send_request(LOCKD_MSG *msg, int interruptable) +{ + kern_return_t kr; + int retries = 0; + mach_port_t lockd_port = IPC_PORT_NULL; + + kr = host_get_lockd_port(host_priv_self(), &lockd_port); + if (kr != KERN_SUCCESS || !IPC_PORT_VALID(lockd_port)) + return (ENOTSUP); + + do { + /* In the kernel all mach messaging is interruptable */ + do { + kr = lockd_request( + lockd_port, + msg->lm_version, + msg->lm_flags, + msg->lm_xid, + msg->lm_fl.l_start, + msg->lm_fl.l_len, + msg->lm_fl.l_pid, + msg->lm_fl.l_type, + msg->lm_fl.l_whence, + (uint32_t *)&msg->lm_addr, + (uint32_t *)&msg->lm_cred, + msg->lm_fh_len, + msg->lm_fh); + if (kr != KERN_SUCCESS) + printf("lockd_request received %d!\n", kr); + } while (!interruptable && kr == MACH_SEND_INTERRUPTED); + } while (kr == MIG_SERVER_DIED && retries++ < MACH_MAX_TRIES); + + ipc_port_release_send(lockd_port); + switch (kr) { + case MACH_SEND_INTERRUPTED: + return (EINTR); + default: + /* + * Other MACH or MIG errors we will retry. Eventually + * we will call nfs_down and allow the user to disable + * locking. + */ + return (EAGAIN); + } +} + +/* + * NFS advisory byte-level locks (client) + */ +int +nfs3_lockd_request( + nfsnode_t np, + int type, + LOCKD_MSG_REQUEST *msgreq, + int flags, + thread_t thd) +{ + LOCKD_MSG *msg = &msgreq->lmr_msg; + int error, error2; + int interruptable, slpflag; + struct nfsmount *nmp; + struct timeval now; + int timeo, starttime, endtime, lastmsg, wentdown = 0; + struct timespec ts; + struct sockaddr *saddr; - if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) { + nmp = NFSTONMP(np); + if (!nmp || !nmp->nm_saddr) + return (ENXIO); + + lck_mtx_lock(&nmp->nm_lock); + saddr = nmp->nm_saddr; + bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len)); + if (nmp->nm_vers == NFS_VER3) + msg->lm_flags |= LOCKD_MSG_NFSV3; + + if (nmp->nm_sotype != SOCK_DGRAM) + msg->lm_flags |= LOCKD_MSG_TCP; + + microuptime(&now); + starttime = now.tv_sec; + lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay)); + interruptable = NMFLAG(nmp, INTR); + lck_mtx_unlock(&nmp->nm_lock); + + lck_mtx_lock(nfs_lock_mutex); + + /* allocate unique xid */ + msg->lm_xid = nfs_lockxid_get(); + nfs_lockdmsg_enqueue(msgreq); + + timeo = 4; + + for (;;) { + nfs_lockd_request_sent = 1; + + /* need to drop nfs_lock_mutex while calling nfs_lockd_send_request() */ + lck_mtx_unlock(nfs_lock_mutex); + error = nfs_lockd_send_request(msg, interruptable); + lck_mtx_lock(nfs_lock_mutex); + if (error && error != EAGAIN) break; - } + /* - * If we're locking a file, wait for an answer. Unlocks succeed - * immediately. + * Always wait for an answer. Not waiting for unlocks could + * cause a lock to be left if the unlock request gets dropped. */ - if (fl->l_type == F_UNLCK) - /* - * XXX this isn't exactly correct. The client side - * needs to continue sending it's unlock until - * it gets a response back. - */ - break; /* - * retry after 20 seconds if we haven't gotten a response yet. - * This number was picked out of thin air... but is longer - * then even a reasonably loaded system should take (at least - * on a local network). XXX Probably should use a back-off - * scheme. + * Retry if it takes too long to get a response. + * + * The timeout numbers were picked out of thin air... they start + * at 4 and double each timeout with a max of 30 seconds. + * + * In order to maintain responsiveness, we pass a small timeout + * to msleep and calculate the timeouts ourselves. This allows + * us to pick up on mount changes quicker. */ - if ((error = tsleep((void *)ut->uu_nlminfo, - PCATCH | PUSER, "lockd", 20*hz)) != 0) { - if (error == EWOULDBLOCK) { +wait_for_granted: + error = EWOULDBLOCK; + slpflag = (interruptable && (type != F_UNLCK)) ? PCATCH : 0; + ts.tv_sec = 2; + ts.tv_nsec = 0; + microuptime(&now); + endtime = now.tv_sec + timeo; + while (now.tv_sec < endtime) { + error = error2 = 0; + if (!msgreq->lmr_answered) { + error = msleep(msgreq, nfs_lock_mutex, slpflag | PUSER, "lockd", &ts); + slpflag = 0; + } + if (msgreq->lmr_answered) { /* - * We timed out, so we rewrite the request - * to the fifo, but only if it isn't already - * full. + * Note: it's possible to have a lock granted at + * essentially the same time that we get interrupted. + * Since the lock may be granted, we can't return an + * error from this request or we might not unlock the + * lock that's been granted. */ - ioflg |= IO_NDELAY; + nmp = NFSTONMP(np); + if ((msgreq->lmr_errno == ENOTSUP) && nmp && + (nmp->nm_state & NFSSTA_LOCKSWORK)) { + /* + * We have evidence that locks work, yet lockd + * returned ENOTSUP. This is probably because + * it was unable to contact the server's lockd + * to send it the request. + * + * Because we know locks work, we'll consider + * this failure to be a timeout. + */ + error = EWOULDBLOCK; + } else { + error = 0; + } + break; + } + if (error != EWOULDBLOCK) + break; + /* check that we still have our mount... */ + /* ...and that we still support locks */ + /* ...and that there isn't a recovery pending */ + nmp = NFSTONMP(np); + if ((error2 = nfs_sigintr(nmp, NULL, NULL, 0))) { + error = error2; + if (type == F_UNLCK) + printf("nfs3_lockd_request: aborting unlock request, error %d\n", error); + break; + } + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { + lck_mtx_unlock(&nmp->nm_lock); + break; + } + if ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return an error that'll get this operation restarted */ + error = NFSERR_GRACE; + lck_mtx_unlock(&nmp->nm_lock); + break; + } + interruptable = NMFLAG(nmp, INTR); + lck_mtx_unlock(&nmp->nm_lock); + microuptime(&now); + } + if (error) { + /* check that we still have our mount... */ + nmp = NFSTONMP(np); + if ((error2 = nfs_sigintr(nmp, NULL, NULL, 0))) { + error = error2; + if (error2 != EINTR) { + if (type == F_UNLCK) + printf("nfs3_lockd_request: aborting unlock request, error %d\n", error); + break; + } + } + /* ...and that we still support locks */ + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { + if (error == EWOULDBLOCK) + error = ENOTSUP; + lck_mtx_unlock(&nmp->nm_lock); + break; + } + /* ...and that there isn't a recovery pending */ + if ((error == EWOULDBLOCK) && (nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return to allow recovery to occur */ + error = NFSERR_DENIED; + lck_mtx_unlock(&nmp->nm_lock); + break; + } + interruptable = NMFLAG(nmp, INTR); + if ((error != EWOULDBLOCK) || + ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) || + ((flags & R_RECOVER) && ((now.tv_sec - starttime) > 30))) { + if ((error == EWOULDBLOCK) && (flags & R_RECOVER)) { + /* give up if this is for recovery and taking too long */ + error = ETIMEDOUT; + } else if ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return an error that'll get this operation restarted */ + error = NFSERR_GRACE; + } + lck_mtx_unlock(&nmp->nm_lock); + /* + * We're going to bail on this request. + * If we were a blocked lock request, send a cancel. + */ + if ((msgreq->lmr_errno == EINPROGRESS) && + !(msg->lm_flags & LOCKD_MSG_CANCEL)) { + /* set this request up as a cancel */ + msg->lm_flags |= LOCKD_MSG_CANCEL; + nfs_lockdmsg_dequeue(msgreq); + msg->lm_xid = nfs_lockxid_get(); + nfs_lockdmsg_enqueue(msgreq); + msgreq->lmr_saved_errno = error; + msgreq->lmr_errno = 0; + msgreq->lmr_answered = 0; + /* reset timeout */ + timeo = 2; + /* send cancel request */ + continue; + } + break; + } + + /* warn if we're not getting any response */ + microuptime(&now); + if ((msgreq->lmr_errno != EINPROGRESS) && + !(msg->lm_flags & LOCKD_MSG_DENIED_GRACE) && + (nmp->nm_tprintf_initial_delay != 0) && + ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) { + lck_mtx_unlock(&nmp->nm_lock); + lastmsg = now.tv_sec; + nfs_down(nmp, thd, 0, NFSSTA_LOCKTIMEO, "lockd not responding", 0); + wentdown = 1; + } else + lck_mtx_unlock(&nmp->nm_lock); + + if (msgreq->lmr_errno == EINPROGRESS) { + /* + * We've got a blocked lock request that we are + * going to retry. First, we'll want to try to + * send a cancel for the previous request. + * + * Clear errno so if we don't get a response + * to the resend we'll call nfs_down(). + * Also reset timeout because we'll expect a + * quick response to the cancel/resend (even if + * it is NLM_BLOCKED). + */ + msg->lm_flags |= LOCKD_MSG_CANCEL; + nfs_lockdmsg_dequeue(msgreq); + msg->lm_xid = nfs_lockxid_get(); + nfs_lockdmsg_enqueue(msgreq); + msgreq->lmr_saved_errno = msgreq->lmr_errno; + msgreq->lmr_errno = 0; + msgreq->lmr_answered = 0; + timeo = 2; + /* send cancel then resend request */ continue; } + /* + * We timed out, so we will resend the request. + */ + if (!(flags & R_RECOVER)) + timeo *= 2; + if (timeo > 30) + timeo = 30; + /* resend request */ + continue; + } + + /* we got a reponse, so the server's lockd is OK */ + nfs_up(NFSTONMP(np), thd, NFSSTA_LOCKTIMEO, + wentdown ? "lockd alive again" : NULL); + wentdown = 0; + + if (msgreq->lmr_answered && (msg->lm_flags & LOCKD_MSG_DENIED_GRACE)) { + /* + * The lock request was denied because the server lockd is + * still in its grace period. So, we need to try the + * request again in a little bit. Return the GRACE error so + * the higher levels can perform the retry. + */ + msgreq->lmr_saved_errno = msgreq->lmr_errno = error = NFSERR_GRACE; + } + + if (msgreq->lmr_errno == EINPROGRESS) { + /* got NLM_BLOCKED response */ + /* need to wait for NLM_GRANTED */ + timeo = 30; + msgreq->lmr_answered = 0; + goto wait_for_granted; + } + + if ((msg->lm_flags & LOCKD_MSG_CANCEL) && + (msgreq->lmr_saved_errno == EINPROGRESS)) { + /* + * We just got a successful reply to the + * cancel of the previous blocked lock request. + * Now, go ahead and return a DENIED error so the + * higher levels can resend the request. + */ + msg->lm_flags &= ~LOCKD_MSG_CANCEL; + nfs_lockdmsg_dequeue(msgreq); + error = NFSERR_DENIED; break; } - if (msg.lm_getlk && ut->uu_nlminfo->retcode == 0) { - if (ut->uu_nlminfo->set_getlk) { - fl->l_pid = ut->uu_nlminfo->getlk_pid; - fl->l_start = ut->uu_nlminfo->getlk_start; - fl->l_len = ut->uu_nlminfo->getlk_len; - fl->l_whence = SEEK_SET; - } else { - fl->l_type = F_UNLCK; + /* + * If the blocked lock request was cancelled. + * Restore the error condition from when we + * originally bailed on the request. + */ + if (msg->lm_flags & LOCKD_MSG_CANCEL) { + msg->lm_flags &= ~LOCKD_MSG_CANCEL; + error = msgreq->lmr_saved_errno; + } else { + error = msgreq->lmr_errno; + } + + nmp = NFSTONMP(np); + if ((error == ENOTSUP) && nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK)) { + /* + * We have NO evidence that locks work and lockd + * returned ENOTSUP. Let's take this as a hint + * that locks aren't supported and disable them + * for this mount. + */ + nfs_lockdmsg_dequeue(msgreq); + lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED) { + nmp->nm_lockmode = NFS_LOCK_MODE_DISABLED; + nfs_lockd_mount_unregister(nmp); + } + nmp->nm_state &= ~NFSSTA_LOCKTIMEO; + lck_mtx_unlock(&nmp->nm_lock); + printf("lockd returned ENOTSUP, disabling locks for nfs server: %s\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname); + return (error); + } + if (!error) { + /* record that NFS file locking has worked on this mount */ + if (nmp) { + lck_mtx_lock(&nmp->nm_lock); + if (!(nmp->nm_state & NFSSTA_LOCKSWORK)) + nmp->nm_state |= NFSSTA_LOCKSWORK; + lck_mtx_unlock(&nmp->nm_lock); } } - error = ut->uu_nlminfo->retcode; break; } - /* XXX stats */ - nfsadvlocks++; - microuptime(&elapsed); - timevalsub(&elapsed, &ut->uu_nlminfo->nlm_lockstart); - if (timevalcmp(&elapsed, &nfsadvlock_longest, >)) - nfsadvlock_longest = elapsed; - timevaladd(&nfsadvlocks_time, &elapsed); - timerclear(&ut->uu_nlminfo->nlm_lockstart); - - error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p); - /* prefer any previous 'error' to our vn_close 'error1'. */ - return (error != 0 ? error : error1); + nfs_lockdmsg_dequeue(msgreq); + + lck_mtx_unlock(nfs_lock_mutex); + + return (error); } /* - * nfslockdans -- - * NFS advisory byte-level locks answer from the lock daemon. + * Send an NLM LOCK message to the server */ int -nfslockdans(struct proc *p, struct lockd_ans *ansp) +nfs3_setlock_rpc( + nfsnode_t np, + struct nfs_open_file *nofp, + struct nfs_file_lock *nflp, + int reclaim, + int flags, + thread_t thd, + kauth_cred_t cred) { - struct proc *targetp; - struct uthread *targetut, *uth; + struct nfs_lock_owner *nlop = nflp->nfl_owner; + struct nfsmount *nmp; int error; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; - /* - * Let root, or someone who once was root (lockd generally - * switches to the daemon uid once it is done setting up) make - * this call. - * - * XXX This authorization check is probably not right. - */ - if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 && - p->p_cred->p_svuid != 0) + nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) + return (ENXIO); + + if (!nlop->nlo_open_owner) { + nfs_open_owner_ref(nofp->nof_owner); + nlop->nlo_open_owner = nofp->nof_owner; + } + if ((error = nfs_lock_owner_set_busy(nlop, thd))) return (error); - /* the version should match, or we're out of sync */ - if (ansp->la_vers != LOCKD_ANS_VERSION) - return (EINVAL); + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + if ((nflp->nfl_flags & NFS_FILE_LOCK_WAIT) && !reclaim) + msg->lm_flags |= LOCKD_MSG_BLOCK; + if (reclaim) + msg->lm_flags |= LOCKD_MSG_RECLAIM; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(cred, &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = nflp->nfl_start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(nflp->nfl_start, nflp->nfl_end); + msg->lm_fl.l_type = nflp->nfl_type; + msg->lm_fl.l_pid = nlop->nlo_pid; + + error = nfs3_lockd_request(np, 0, &msgreq, flags, thd); + + nfs_lock_owner_clear_busy(nlop); + return (error); +} - /* Find the process & thread */ - if ((targetp = pfind(ansp->la_msg_ident.pid)) == NULL) - return (ESRCH); - targetut = ansp->la_msg_ident.ut; - TAILQ_FOREACH(uth, &targetp->p_uthlist, uu_list) { - if (uth == targetut) - break; - } - /* - * Verify the pid hasn't been reused (if we can), and it isn't waiting - * for an answer from a more recent request. We return an EPIPE if - * the match fails, because we've already used ESRCH above, and this - * is sort of like writing on a pipe after the reader has closed it. - * If only the seq# is off, don't return an error just return. It could - * just be a response to a retransmitted request. - */ - if (uth == NULL || uth != targetut || targetut->uu_nlminfo == NULL) - return (EPIPE); - if (ansp->la_msg_ident.msg_seq != -1) { - if (timevalcmp(&targetut->uu_nlminfo->pid_start, - &ansp->la_msg_ident.pid_start, !=)) - return (EPIPE); - if (targetut->uu_nlminfo->msg_seq != ansp->la_msg_ident.msg_seq) - return (0); - } +/* + * Send an NLM UNLOCK message to the server + */ +int +nfs3_unlock_rpc( + nfsnode_t np, + struct nfs_lock_owner *nlop, + __unused int type, + uint64_t start, + uint64_t end, + int flags, + thread_t thd, + kauth_cred_t cred) +{ + struct nfsmount *nmp; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; - /* Found the thread, so set its return errno and wake it up. */ + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); - targetut->uu_nlminfo->retcode = ansp->la_errno; - targetut->uu_nlminfo->set_getlk = ansp->la_getlk_set; - targetut->uu_nlminfo->getlk_pid = ansp->la_getlk_pid; - targetut->uu_nlminfo->getlk_start = ansp->la_getlk_start; - targetut->uu_nlminfo->getlk_len = ansp->la_getlk_len; + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(cred, &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(start, end); + msg->lm_fl.l_type = F_UNLCK; + msg->lm_fl.l_pid = nlop->nlo_pid; + + return (nfs3_lockd_request(np, F_UNLCK, &msgreq, flags, thd)); +} - (void)wakeup((void *)targetut->uu_nlminfo); +/* + * Send an NLM LOCK TEST message to the server + */ +int +nfs3_getlock_rpc( + nfsnode_t np, + struct nfs_lock_owner *nlop, + struct flock *fl, + uint64_t start, + uint64_t end, + vfs_context_t ctx) +{ + struct nfsmount *nmp; + int error; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; - return (0); + nmp = NFSTONMP(np); + if (nfs_mount_gone(nmp)) + return (ENXIO); + + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + msg->lm_flags |= LOCKD_MSG_TEST; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(vfs_context_ucred(ctx), &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(start, end); + msg->lm_fl.l_type = fl->l_type; + msg->lm_fl.l_pid = nlop->nlo_pid; + + error = nfs3_lockd_request(np, 0, &msgreq, 0, vfs_context_thread(ctx)); + + if (!error && (msg->lm_flags & LOCKD_MSG_TEST) && !msgreq.lmr_errno) { + if (msg->lm_fl.l_type != F_UNLCK) { + fl->l_type = msg->lm_fl.l_type; + fl->l_pid = msg->lm_fl.l_pid; + fl->l_start = msg->lm_fl.l_start; + fl->l_len = msg->lm_fl.l_len; + fl->l_whence = SEEK_SET; + } else + fl->l_type = F_UNLCK; + } + + return (error); } /* - * nfslockdfd -- - * NFS advisory byte-level locks: fifo file# from the lock daemon. + * nfslockdans -- + * NFS advisory byte-level locks answer from the lock daemon. */ int -nfslockdfd(struct proc *p, int fd) +nfslockdans(proc_t p, struct lockd_ans *ansp) { + LOCKD_MSG_REQUEST *msgreq; int error; - struct file *fp, *ofp; - error = suser(p->p_ucred, &p->p_acflag); + /* Let root make this call. */ + error = proc_suser(p); if (error) return (error); - if (fd < 0) { - fp = 0; - } else { - error = getvnode(p, fd, &fp); - if (error) - return (error); - (void)fref(fp); + + /* the version should match, or we're out of sync */ + if (ansp->la_version != LOCKD_ANS_VERSION) + return (EINVAL); + + lck_mtx_lock(nfs_lock_mutex); + + /* try to find the lockd message by transaction id (cookie) */ + msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid); + if (ansp->la_flags & LOCKD_ANS_GRANTED) { + /* + * We can't depend on the granted message having our cookie, + * so we check the answer against the lockd message found. + * If no message was found or it doesn't match the answer, + * we look for the lockd message by the answer's lock info. + */ + if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp)) + msgreq = nfs_lockdmsg_find_by_answer(ansp); + /* + * We need to make sure this request isn't being cancelled + * If it is, we don't want to accept the granted message. + */ + if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL)) + msgreq = NULL; + } + if (!msgreq) { + lck_mtx_unlock(nfs_lock_mutex); + return (EPIPE); + } + + msgreq->lmr_errno = ansp->la_errno; + if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) { + if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) { + if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL) + msgreq->lmr_msg.lm_fl.l_type = F_WRLCK; + else + msgreq->lmr_msg.lm_fl.l_type = F_RDLCK; + msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid; + msgreq->lmr_msg.lm_fl.l_start = ansp->la_start; + msgreq->lmr_msg.lm_fl.l_len = ansp->la_len; + } else { + msgreq->lmr_msg.lm_fl.l_type = F_UNLCK; + } } - ofp = nfslockdfp; - nfslockdfp = fp; - if (ofp) - (void)frele(ofp); - nfslockdpid = nfslockdfp ? p->p_pid : 0; - (void)wakeup((void *)&nfslockdfp); + if (ansp->la_flags & LOCKD_ANS_DENIED_GRACE) + msgreq->lmr_msg.lm_flags |= LOCKD_MSG_DENIED_GRACE; + + msgreq->lmr_answered = 1; + lck_mtx_unlock(nfs_lock_mutex); + wakeup(msgreq); + return (0); } /* - * nfslockdwait -- - * lock daemon waiting for lock request + * nfslockdnotify -- + * NFS host restart notification from the lock daemon. + * + * Used to initiate reclaiming of held locks when a server we + * have mounted reboots. */ int -nfslockdwait(struct proc *p) +nfslockdnotify(proc_t p, user_addr_t argp) { - int error; - struct file *fp, *ofp; + int error, i, headsize; + struct lockd_notify ln; + struct nfsmount *nmp; + struct sockaddr *saddr; - if (p->p_pid != nfslockdpid) { - error = suser(p->p_ucred, &p->p_acflag); + /* Let root make this call. */ + error = proc_suser(p); + if (error) + return (error); + + headsize = (char*)&ln.ln_addr[0] - (char*)&ln.ln_version; + error = copyin(argp, &ln, headsize); + if (error) + return (error); + if (ln.ln_version != LOCKD_NOTIFY_VERSION) + return (EINVAL); + if ((ln.ln_addrcount < 1) || (ln.ln_addrcount > 128)) + return (EINVAL); + argp += headsize; + saddr = (struct sockaddr *)&ln.ln_addr[0]; + + lck_mtx_lock(nfs_lock_mutex); + + for (i=0; i < ln.ln_addrcount; i++) { + error = copyin(argp, &ln.ln_addr[0], sizeof(ln.ln_addr[0])); if (error) - return (error); - } - if (nfslockdwaiting) - return (EBUSY); - if (nfslockdfifowritten) { - nfslockdfifowritten = 0; - return (0); + break; + argp += sizeof(ln.ln_addr[0]); + /* scan lockd mount list for match to this address */ + TAILQ_FOREACH(nmp, &nfs_lockd_mount_list, nm_ldlink) { + /* check if address matches this mount's server address */ + if (!nmp->nm_saddr || nfs_sockaddr_cmp(saddr, nmp->nm_saddr)) + continue; + /* We have a match! Mark it as needing recovery. */ + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, 0); + lck_mtx_unlock(&nmp->nm_lock); + } } - nfslockdwaiting = 1; - tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0); - nfslockdwaiting = 0; + lck_mtx_unlock(nfs_lock_mutex); - return (0); + return (error); } +