[apple/xnu.git] / bsd / nfs / nfs_lock.c

/*
 * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */
/*-
 * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Berkeley Software Design Inc's name may not be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>		/* for hz */
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/lockf.h>		/* for hz */ /* Must come after sys/malloc.h */
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socket.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>

#include <kern/thread_act.h>

#include <machine/limits.h>

#include <net/if.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_lock.h>
#include <nfs/nlminfo.h>

#define OFF_MAX QUAD_MAX

/*
 * Advisory-lock statistics, updated at the end of nfs_dolock():
 *   nfsadvlocks        - count of requests that reached the lockd fifo stage
 *   nfsadvlock_longest - longest elapsed time observed for a single request
 *   nfsadvlocks_time   - sum of the elapsed times of all counted requests
 */
uint64_t nfsadvlocks = 0;
struct timeval nfsadvlock_longest = {0, 0};
struct timeval nfsadvlocks_time = {0, 0};

/*
 * Lock daemon state:
 *   nfslockdpid         - pid of the process that registered the fifo via
 *                         nfslockdfd(); 0 when no fifo is registered
 *   nfslockdfp          - file registered via nfslockdfd(); nfs_dolock()
 *                         writes requests to the vnode in its f_data
 *   nfslockdwaiting     - nonzero while a caller sleeps in nfslockdwait()
 *   nfslockdfifowritten - set by nfs_dolock() after a fifo write attempt;
 *                         consumed (cleared) by nfslockdwait()
 *   nfslockdfifolock    - flag word serializing fifo writers in nfs_dolock()
 */
pid_t nfslockdpid = 0;
struct file *nfslockdfp = 0;
int nfslockdwaiting = 0;
int nfslockdfifowritten = 0;
int nfslockdfifolock = 0;
/* Bits in nfslockdfifolock: a writer holds the lock / a writer is waiting. */
#define NFSLOCKDFIFOLOCK_LOCKED	1
#define NFSLOCKDFIFOLOCK_WANT	2

/*
 * XXX
 * We have to let the process know if the call succeeded.  We use an extra
 * field in the nlminfo structure hanging off the uthread's uu_nlminfo
 * pointer, since that structure is already used for lockd bookkeeping.
 */

/*
 * nfs_advlock --
 *      NFS advisory byte-level locks.
 */
int
nfs_dolock(struct vop_advlock_args *ap)
/* struct vop_advlock_args {
        struct vnodeop_desc *a_desc;
        struct vnode *a_vp;
        caddr_t a_id;
        int a_op;
        struct flock *a_fl;
        int a_flags;
}; */
{
	LOCKD_MSG msg;
	struct nameidata nd;
	struct vnode *vp, *wvp;
	struct nfsnode *np;
	int error, error1;
	struct flock *fl;
	int fmode, ioflg;
	struct proc *p;
        struct uthread *ut;
	struct timeval elapsed;
	struct nfsmount *nmp;
	struct vattr vattr;
	off_t start, end;

        ut = get_bsdthread_info(current_act());
	p = current_proc();

	vp = ap->a_vp;
	fl = ap->a_fl;
	np = VTONFS(vp);

	nmp = VFSTONFS(vp->v_mount);
	if (!nmp)
		return (ENXIO);
	if (nmp->nm_flag & NFSMNT_NOLOCKS)
		return (EOPNOTSUPP);

	/*
	 * The NLM protocol doesn't allow the server to return an error
	 * on ranges, so we do it.  Pre LFS (Large File Summit)
	 * standards required EINVAL for the range errors.  More recent
	 * standards use EOVERFLOW, but their EINVAL wording still
	 * encompasses these errors.
	 * Any code sensitive to this is either:
	 *  1) written pre-LFS and so can handle only EINVAL, or
	 *  2) written post-LFS and thus ought to be tolerant of pre-LFS
	 *     implementations.
	 * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
	 */
	if (fl->l_whence != SEEK_END) {
		if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
		    fl->l_start < 0 ||
		    (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
		    (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
			return (EINVAL);
	}
	/*
	 * If daemon is running take a ref on its fifo
	 */
	if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) {
		if (!nfslockdwaiting)
			return (EOPNOTSUPP);
		/*
		 * Don't wake lock daemon if it hasn't been started yet and
		 * this is an unlock request (since we couldn't possibly
		 * actually have a lock on the file).  This could be an
		 * uninformed unlock request due to closef()'s behavior of doing
		 * unlocks on all files if a process has had a lock on ANY file.
		 */
		if (!nfslockdfp && (fl->l_type == F_UNLCK))
			return (EINVAL);
		/* wake up lock daemon */
		(void)wakeup((void *)&nfslockdwaiting);
		/* wait on nfslockdfp for a while to allow daemon to start */
		tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz);
		/* check for nfslockdfp and f_data */
		if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data))
			return (EOPNOTSUPP);
	}
	VREF(wvp);
	/*
	 * if there is no nfsowner table yet, allocate one.
	 */
	if (ut->uu_nlminfo == NULL) {
		if (ap->a_op == F_UNLCK) {
			vrele(wvp);
			return (0);
		}
		MALLOC(ut->uu_nlminfo, struct nlminfo *,
			sizeof(struct nlminfo), M_LOCKF, M_WAITOK | M_ZERO);
		ut->uu_nlminfo->pid_start = p->p_stats->p_start;
	}
	/*
	 * Fill in the information structure.
	 */
	msg.lm_version = LOCKD_MSG_VERSION;
	msg.lm_msg_ident.pid = p->p_pid;
	msg.lm_msg_ident.ut = ut;
	msg.lm_msg_ident.pid_start = ut->uu_nlminfo->pid_start;
	msg.lm_msg_ident.msg_seq = ++(ut->uu_nlminfo->msg_seq);

	/*
	 * The NFS Lock Manager protocol doesn't directly handle
	 * negative lengths or SEEK_END, so we need to normalize
	 * things here where we have all the info.
	 * (Note: SEEK_CUR is already adjusted for at this point)
	 */
	/* Convert the flock structure into a start and end. */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * to fl->l_start when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;
	case SEEK_END:
		/* need to flush, and refetch attributes to make */
		/* sure we have the correct end of file offset   */
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1);
			if (error) {
				vrele(wvp);
				return (error);
			}
		}
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
		if (error) {
			vrele(wvp);
			return (error);
		}
		start = np->n_size + fl->l_start;
		break;
	default:
		vrele(wvp);
		return (EINVAL);
	}
	if (fl->l_len == 0)
		end = -1;
	else if (fl->l_len > 0)
		end = start + fl->l_len - 1;
	else { /* l_len is negative */
		end = start - 1;
		start += fl->l_len;
	}
	if (start < 0) {
		vrele(wvp);
		return (EINVAL);
	}

	msg.lm_fl = *fl;
	msg.lm_fl.l_start = start;
	if (end != -1)
		msg.lm_fl.l_len = end - start + 1;

	msg.lm_wait = ap->a_flags & F_WAIT;
	msg.lm_getlk = ap->a_op == F_GETLK;

	nmp = VFSTONFS(vp->v_mount);
	if (!nmp) {
		vrele(wvp);
		return (ENXIO);
	}

	bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg.lm_addr,
	      min(sizeof msg.lm_addr,
		  mtod(nmp->nm_nam, struct sockaddr *)->sa_len));
	msg.lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
	bcopy(VTONFS(vp)->n_fhp, msg.lm_fh, msg.lm_fh_len);
	msg.lm_nfsv3 = NFS_ISV3(vp);
	cru2x(p->p_ucred, &msg.lm_cred);

	microuptime(&ut->uu_nlminfo->nlm_lockstart);

	fmode = FFLAGS(O_WRONLY);
	if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) {
		vrele(wvp);
		return (error);
	}
	++wvp->v_writecount;

#define IO_NOMACCHECK 0;
	ioflg = IO_UNIT | IO_NOMACCHECK;
	for (;;) {
		VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE);

		while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
			nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
			if (tsleep((void *)&nfslockdfifolock, PCATCH | PUSER, "lockdfifo", 20*hz))
				break;
		}
		nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;

		error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)&msg, sizeof(msg), 0,
		    UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p);

		nfslockdfifowritten = 1;

		nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
		if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
			nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
			wakeup((void *)&nfslockdfifolock);
		}
		/* wake up lock daemon */
		if (nfslockdwaiting)
			(void)wakeup((void *)&nfslockdwaiting);

		if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
			break;
		}
		/*
		 * If we're locking a file, wait for an answer.  Unlocks succeed
		 * immediately.
		 */
		if (fl->l_type == F_UNLCK)
			/*
			 * XXX this isn't exactly correct.  The client side
			 * needs to continue sending it's unlock until
			 * it gets a response back.
			 */
			break;

		/*
		 * retry after 20 seconds if we haven't gotten a response yet.
		 * This number was picked out of thin air... but is longer
		 * then even a reasonably loaded system should take (at least
		 * on a local network).  XXX Probably should use a back-off
		 * scheme.
		 */
		if ((error = tsleep((void *)ut->uu_nlminfo,
				    PCATCH | PUSER, "lockd", 20*hz)) != 0) {
			if (error == EWOULDBLOCK) {
				/*
				 * We timed out, so we rewrite the request
				 * to the fifo, but only if it isn't already
				 * full.
				 */
				ioflg |= IO_NDELAY;
				continue;
			}

			break;
		}

		if (msg.lm_getlk && ut->uu_nlminfo->retcode == 0) {
			if (ut->uu_nlminfo->set_getlk) {
				fl->l_pid = ut->uu_nlminfo->getlk_pid;
				fl->l_start = ut->uu_nlminfo->getlk_start;
				fl->l_len = ut->uu_nlminfo->getlk_len;
				fl->l_whence = SEEK_SET;
			} else {
				fl->l_type = F_UNLCK;
			}
		}
		error = ut->uu_nlminfo->retcode;
		break;
	}

	/* XXX stats */
	nfsadvlocks++;
	microuptime(&elapsed);
	timevalsub(&elapsed, &ut->uu_nlminfo->nlm_lockstart);
	if (timevalcmp(&elapsed, &nfsadvlock_longest, >))
		nfsadvlock_longest = elapsed;
	timevaladd(&nfsadvlocks_time, &elapsed);
	timerclear(&ut->uu_nlminfo->nlm_lockstart);

	error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p);
	/* prefer any previous 'error' to our vn_close 'error1'. */
	return (error != 0 ? error : error1);
}

/*
 * nfslockdans --
 *      NFS advisory byte-level locks answer from the lock daemon.
 */
int
nfslockdans(struct proc *p, struct lockd_ans *ansp)
{
	struct proc *targetp;
	struct uthread *targetut, *uth;
	int error;

	/*
	 * Let root, or someone who once was root (lockd generally
	 * switches to the daemon uid once it is done setting up) make
	 * this call.
	 *
	 * XXX This authorization check is probably not right.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 &&
	    p->p_cred->p_svuid != 0)
		return (error);

	/* the version should match, or we're out of sync */
	if (ansp->la_vers != LOCKD_ANS_VERSION)
		return (EINVAL);

	/* Find the process & thread */
	if ((targetp = pfind(ansp->la_msg_ident.pid)) == NULL)
		return (ESRCH);
	targetut = ansp->la_msg_ident.ut;
	TAILQ_FOREACH(uth, &targetp->p_uthlist, uu_list) {
		if (uth == targetut)
			break;
	}
	/*
	 * Verify the pid hasn't been reused (if we can), and it isn't waiting
	 * for an answer from a more recent request.  We return an EPIPE if
	 * the match fails, because we've already used ESRCH above, and this
	 * is sort of like writing on a pipe after the reader has closed it.
	 * If only the seq# is off, don't return an error just return.  It could
	 * just be a response to a retransmitted request.
	 */
	if (uth == NULL || uth != targetut || targetut->uu_nlminfo == NULL)
		return (EPIPE);
	if (ansp->la_msg_ident.msg_seq != -1) {
		if (timevalcmp(&targetut->uu_nlminfo->pid_start,
		               &ansp->la_msg_ident.pid_start, !=))
			return (EPIPE);
		if (targetut->uu_nlminfo->msg_seq != ansp->la_msg_ident.msg_seq)
			return (0);
	}

	/* Found the thread, so set its return errno and wake it up. */

	targetut->uu_nlminfo->retcode = ansp->la_errno;
	targetut->uu_nlminfo->set_getlk = ansp->la_getlk_set;
	targetut->uu_nlminfo->getlk_pid = ansp->la_getlk_pid;
	targetut->uu_nlminfo->getlk_start = ansp->la_getlk_start;
	targetut->uu_nlminfo->getlk_len = ansp->la_getlk_len;

	(void)wakeup((void *)targetut->uu_nlminfo);

	return (0);
}

/*
 * nfslockdfd --
 *      NFS advisory byte-level locks: fifo file# from the lock daemon.
 */
int
nfslockdfd(struct proc *p, int fd)
{
	int error;
	struct file *fp, *ofp;

	error = suser(p->p_ucred, &p->p_acflag);
	if (error)
		return (error);
	if (fd < 0) {
		fp = 0;
	} else {
		error = getvnode(p, fd, &fp);
		if (error)
			return (error);
		(void)fref(fp);
	}
	ofp = nfslockdfp;
	nfslockdfp = fp;
	if (ofp)
		(void)frele(ofp);
	nfslockdpid = nfslockdfp ? p->p_pid : 0;
	(void)wakeup((void *)&nfslockdfp);
	return (0);
}

/*
 * nfslockdwait --
 *      lock daemon waiting for lock request
 *
 * Callable by the registered lock daemon process (nfslockdpid) or by the
 * superuser.  Returns EBUSY if another caller is already waiting, returns
 * 0 immediately if the fifo has been written since the last call, and
 * otherwise sleeps until woken on &nfslockdwaiting.
 */
int
nfslockdwait(struct proc *p)
{
	int error;

	if (p->p_pid != nfslockdpid) {
		error = suser(p->p_ucred, &p->p_acflag);
		if (error)
			return (error);
	}
	if (nfslockdwaiting)
		return (EBUSY);
	/* A request was queued since we last looked; don't sleep, consume it. */
	if (nfslockdfifowritten) {
		nfslockdfifowritten = 0;
		return (0);
	}

	nfslockdwaiting = 1;
	/*
	 * No timeout.  The sleep result is deliberately ignored: the caller
	 * gets 0 whether it was woken by a request or interrupted.
	 */
	(void)tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
	nfslockdwaiting = 0;

	return (0);
}
Commit	Line	Data
55e303ae A	1	/*
	2	* Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
	7	*
	8	* This file contains Original Code and/or Modifications of Original Code
	9	* as defined in and that are subject to the Apple Public Source License
	10	* Version 2.0 (the 'License'). You may not use this file except in
	11	* compliance with the License. Please obtain a copy of the License at
	12	* http://www.opensource.apple.com/apsl/ and read it before using this
	13	* file.
	14	*
	15	* The Original Code and all software distributed under the License are
	16	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	17	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	18	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	19	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	20	* Please see the License for the specific language governing rights and
	21	* limitations under the License.
	22	*
	23	* @APPLE_LICENSE_HEADER_END@
	24	*/
	25	/*-
	26	* Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
	27	*
	28	* Redistribution and use in source and binary forms, with or without
	29	* modification, are permitted provided that the following conditions
	30	* are met:
	31	* 1. Redistributions of source code must retain the above copyright
	32	* notice, this list of conditions and the following disclaimer.
	33	* 2. Redistributions in binary form must reproduce the above copyright
	34	* notice, this list of conditions and the following disclaimer in the
	35	* documentation and/or other materials provided with the distribution.
	36	* 3. Berkeley Software Design Inc's name may not be used to endorse or
	37	* promote products derived from this software without specific prior
	38	* written permission.
	39	*
	40	* THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
	41	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	42	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	43	* ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
	44	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	45	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	46	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	47	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	48	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	49	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	50	* SUCH DAMAGE.
	51	*
	52	* from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
	53	*/
	54
	55	#include <sys/cdefs.h>
	56	#include <sys/param.h>
	57	#include <sys/systm.h>
	58	#include <sys/fcntl.h>
	59	#include <sys/kernel.h> /* for hz */
	60	#include <sys/file.h>
	61	#include <sys/lock.h>
	62	#include <sys/malloc.h>
	63	#include <sys/lockf.h> /* for hz / / Must come after sys/malloc.h */
	64	#include <sys/mbuf.h>
65	#include <sys/mount.h>
66	#include <sys/namei.h>
67	#include <sys/proc.h>
68	#include <sys/resourcevar.h>
69	#include <sys/socket.h>
70	#include <sys/socket.h>
71	#include <sys/unistd.h>
72	#include <sys/user.h>
73	#include <sys/vnode.h>
74
75	#include <kern/thread_act.h>
76
77	#include <machine/limits.h>
78
79	#include <net/if.h>
80
81	#include <nfs/rpcv2.h>
82	#include <nfs/nfsproto.h>
83	#include <nfs/nfs.h>
84	#include <nfs/nfsmount.h>
85	#include <nfs/nfsnode.h>
86	#include <nfs/nfs_lock.h>
87	#include <nfs/nlminfo.h>
88
89	#define OFF_MAX QUAD_MAX
90
91	uint64_t nfsadvlocks = 0;
92	struct timeval nfsadvlock_longest = {0, 0};
93	struct timeval nfsadvlocks_time = {0, 0};
94
95	pid_t nfslockdpid = 0;
96	struct file *nfslockdfp = 0;
97	int nfslockdwaiting = 0;
98	int nfslockdfifowritten = 0;
99	int nfslockdfifolock = 0;
100	#define NFSLOCKDFIFOLOCK_LOCKED 1
101	#define NFSLOCKDFIFOLOCK_WANT 2
102
103	/*
104	* XXX
105	* We have to let the process know if the call succeeded. I'm using an extra
106	* field in the uu_nlminfo field in the uthread structure, as it is already for
107	* lockd stuff.
108	*/
109
110	/*
111	* nfs_advlock --
112	* NFS advisory byte-level locks.
113	*/
114	int
115	nfs_dolock(struct vop_advlock_args *ap)
116	/* struct vop_advlock_args {
117	struct vnodeop_desc *a_desc;
118	struct vnode *a_vp;
119	caddr_t a_id;
120	int a_op;
121	struct flock *a_fl;
122	int a_flags;
123	}; */
124	{
125	LOCKD_MSG msg;
126	struct nameidata nd;
127	struct vnode vp, wvp;
128	struct nfsnode *np;
129	int error, error1;
130	struct flock *fl;
131	int fmode, ioflg;
132	struct proc *p;
133	struct uthread *ut;
134	struct timeval elapsed;
135	struct nfsmount *nmp;
136	struct vattr vattr;
137	off_t start, end;
138
139	ut = get_bsdthread_info(current_act());
140	p = current_proc();
141
142	vp = ap->a_vp;
143	fl = ap->a_fl;
144	np = VTONFS(vp);
145
146	nmp = VFSTONFS(vp->v_mount);
147	if (!nmp)
148	return (ENXIO);
149	if (nmp->nm_flag & NFSMNT_NOLOCKS)
150	return (EOPNOTSUPP);
151
152	/*
153	* The NLM protocol doesn't allow the server to return an error
154	* on ranges, so we do it. Pre LFS (Large File Summit)
155	* standards required EINVAL for the range errors. More recent
156	* standards use EOVERFLOW, but their EINVAL wording still
157	* encompasses these errors.
158	* Any code sensitive to this is either:
159	* 1) written pre-LFS and so can handle only EINVAL, or
160	* 2) written post-LFS and thus ought to be tolerant of pre-LFS
161	* implementations.
162	* Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
163	*/
164	if (fl->l_whence != SEEK_END) {
165	if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) \|\|
166	fl->l_start < 0 \|\|
167	(fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) \|\|
168	(fl->l_len < 0 && fl->l_start + fl->l_len < 0))
169	return (EINVAL);
170	}
171	/*
172	* If daemon is running take a ref on its fifo
173	*/
174	if (!nfslockdfp \|\| !(wvp = (struct vnode *)nfslockdfp->f_data)) {
175	if (!nfslockdwaiting)
176	return (EOPNOTSUPP);
177	/*
178	* Don't wake lock daemon if it hasn't been started yet and
179	* this is an unlock request (since we couldn't possibly
180	* actually have a lock on the file). This could be an
181	* uninformed unlock request due to closef()'s behavior of doing
182	* unlocks on all files if a process has had a lock on ANY file.
183	*/
184	if (!nfslockdfp && (fl->l_type == F_UNLCK))
185	return (EINVAL);
186	/* wake up lock daemon */
187	(void)wakeup((void *)&nfslockdwaiting);
188	/* wait on nfslockdfp for a while to allow daemon to start */
189	tsleep((void )&nfslockdfp, PCATCH \| PUSER, "lockd", 60hz);
190	/* check for nfslockdfp and f_data */
191	if (!nfslockdfp \|\| !(wvp = (struct vnode *)nfslockdfp->f_data))
192	return (EOPNOTSUPP);
193	}
194	VREF(wvp);
195	/*
196	* if there is no nfsowner table yet, allocate one.
197	*/
198	if (ut->uu_nlminfo == NULL) {
199	if (ap->a_op == F_UNLCK) {
200	vrele(wvp);
201	return (0);
202	}
203	MALLOC(ut->uu_nlminfo, struct nlminfo *,
204	sizeof(struct nlminfo), M_LOCKF, M_WAITOK \| M_ZERO);
205	ut->uu_nlminfo->pid_start = p->p_stats->p_start;
206	}
207	/*
208	* Fill in the information structure.
209	*/
210	msg.lm_version = LOCKD_MSG_VERSION;
211	msg.lm_msg_ident.pid = p->p_pid;
212	msg.lm_msg_ident.ut = ut;
213	msg.lm_msg_ident.pid_start = ut->uu_nlminfo->pid_start;
214	msg.lm_msg_ident.msg_seq = ++(ut->uu_nlminfo->msg_seq);
215
216	/*
217	* The NFS Lock Manager protocol doesn't directly handle
218	* negative lengths or SEEK_END, so we need to normalize
219	* things here where we have all the info.
220	* (Note: SEEK_CUR is already adjusted for at this point)
221	*/
222	/* Convert the flock structure into a start and end. */
223	switch (fl->l_whence) {
224	case SEEK_SET:
225	case SEEK_CUR:
226	/*
227	* Caller is responsible for adding any necessary offset
228	* to fl->l_start when SEEK_CUR is used.
229	*/
230	start = fl->l_start;
231	break;
232	case SEEK_END:
233	/* need to flush, and refetch attributes to make */
234	/* sure we have the correct end of file offset */
235	if (np->n_flag & NMODIFIED) {
236	np->n_attrstamp = 0;
237	error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1);
238	if (error) {
239	vrele(wvp);
240	return (error);
241	}
242	}
243	np->n_attrstamp = 0;
244	error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
245	if (error) {
246	vrele(wvp);
247	return (error);
248	}
249	start = np->n_size + fl->l_start;
250	break;
251	default:
252	vrele(wvp);
253	return (EINVAL);
254	}
255	if (fl->l_len == 0)
256	end = -1;
257	else if (fl->l_len > 0)
258	end = start + fl->l_len - 1;
259	else { /* l_len is negative */
260	end = start - 1;
261	start += fl->l_len;
262	}
263	if (start < 0) {
264	vrele(wvp);
265	return (EINVAL);
266	}
267
268	msg.lm_fl = *fl;
269	msg.lm_fl.l_start = start;
270	if (end != -1)
271	msg.lm_fl.l_len = end - start + 1;
272
273	msg.lm_wait = ap->a_flags & F_WAIT;
274	msg.lm_getlk = ap->a_op == F_GETLK;
275
276	nmp = VFSTONFS(vp->v_mount);
277	if (!nmp) {
278	vrele(wvp);
279	return (ENXIO);
280	}
281
282	bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg.lm_addr,
283	min(sizeof msg.lm_addr,
284	mtod(nmp->nm_nam, struct sockaddr *)->sa_len));
285	msg.lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
286	bcopy(VTONFS(vp)->n_fhp, msg.lm_fh, msg.lm_fh_len);
287	msg.lm_nfsv3 = NFS_ISV3(vp);
288	cru2x(p->p_ucred, &msg.lm_cred);
289
290	microuptime(&ut->uu_nlminfo->nlm_lockstart);
291
292	fmode = FFLAGS(O_WRONLY);
293	if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) {
294	vrele(wvp);
295	return (error);
296	}
297	++wvp->v_writecount;
298
299	#define IO_NOMACCHECK 0;
300	ioflg = IO_UNIT \| IO_NOMACCHECK;
301	for (;;) {
302	VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE);
303
304	while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
305	nfslockdfifolock \|= NFSLOCKDFIFOLOCK_WANT;
306	if (tsleep((void )&nfslockdfifolock, PCATCH \| PUSER, "lockdfifo", 20hz))
307	break;
308	}
309	nfslockdfifolock \|= NFSLOCKDFIFOLOCK_LOCKED;
310
311	error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)&msg, sizeof(msg), 0,
312	UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p);
313
314	nfslockdfifowritten = 1;
315
316	nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
317	if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
318	nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
319	wakeup((void *)&nfslockdfifolock);
320	}
321	/* wake up lock daemon */
322	if (nfslockdwaiting)
323	(void)wakeup((void *)&nfslockdwaiting);
324
325	if (error && (((ioflg & IO_NDELAY) == 0) \|\| error != EAGAIN)) {
326	break;
327	}
328	/*
329	* If we're locking a file, wait for an answer. Unlocks succeed
330	* immediately.
331	*/
332	if (fl->l_type == F_UNLCK)
333	/*
334	* XXX this isn't exactly correct. The client side
335	* needs to continue sending it's unlock until
336	* it gets a response back.
337	*/
338	break;
339
340	/*
341	* retry after 20 seconds if we haven't gotten a response yet.
342	* This number was picked out of thin air... but is longer
343	* then even a reasonably loaded system should take (at least
344	* on a local network). XXX Probably should use a back-off
345	* scheme.
346	*/
347	if ((error = tsleep((void *)ut->uu_nlminfo,
348	PCATCH \| PUSER, "lockd", 20*hz)) != 0) {
349	if (error == EWOULDBLOCK) {
350	/*
351	* We timed out, so we rewrite the request
352	* to the fifo, but only if it isn't already
353	* full.
354	*/
355	ioflg \|= IO_NDELAY;
356	continue;
357	}
358
359	break;
360	}
361
362	if (msg.lm_getlk && ut->uu_nlminfo->retcode == 0) {
363	if (ut->uu_nlminfo->set_getlk) {
364	fl->l_pid = ut->uu_nlminfo->getlk_pid;
365	fl->l_start = ut->uu_nlminfo->getlk_start;
366	fl->l_len = ut->uu_nlminfo->getlk_len;
367	fl->l_whence = SEEK_SET;
368	} else {
369	fl->l_type = F_UNLCK;
370	}
371	}
372	error = ut->uu_nlminfo->retcode;
373	break;
374	}
375
376	/* XXX stats */
377	nfsadvlocks++;
378	microuptime(&elapsed);
379	timevalsub(&elapsed, &ut->uu_nlminfo->nlm_lockstart);
380	if (timevalcmp(&elapsed, &nfsadvlock_longest, >))
381	nfsadvlock_longest = elapsed;
382	timevaladd(&nfsadvlocks_time, &elapsed);
383	timerclear(&ut->uu_nlminfo->nlm_lockstart);
384
385	error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p);
386	/* prefer any previous 'error' to our vn_close 'error1'. */
387	return (error != 0 ? error : error1);
388	}
389
390	/*
391	* nfslockdans --
392	* NFS advisory byte-level locks answer from the lock daemon.
393	*/
394	int
395	nfslockdans(struct proc p, struct lockd_ans ansp)
396	{
397	struct proc *targetp;
398	struct uthread targetut, uth;
399	int error;
400
401	/*
402	* Let root, or someone who once was root (lockd generally
403	* switches to the daemon uid once it is done setting up) make
404	* this call.
405	*
406	* XXX This authorization check is probably not right.
407	*/
408	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 &&
409	p->p_cred->p_svuid != 0)
410	return (error);
411
412	/* the version should match, or we're out of sync */
413	if (ansp->la_vers != LOCKD_ANS_VERSION)
414	return (EINVAL);
415
416	/* Find the process & thread */
417	if ((targetp = pfind(ansp->la_msg_ident.pid)) == NULL)
418	return (ESRCH);
419	targetut = ansp->la_msg_ident.ut;
420	TAILQ_FOREACH(uth, &targetp->p_uthlist, uu_list) {
421	if (uth == targetut)
422	break;
423	}
424	/*
425	* Verify the pid hasn't been reused (if we can), and it isn't waiting
426	* for an answer from a more recent request. We return an EPIPE if
427	* the match fails, because we've already used ESRCH above, and this
428	* is sort of like writing on a pipe after the reader has closed it.
429	* If only the seq# is off, don't return an error just return. It could
430	* just be a response to a retransmitted request.
431	*/
432	if (uth == NULL \|\| uth != targetut \|\| targetut->uu_nlminfo == NULL)
433	return (EPIPE);
434	if (ansp->la_msg_ident.msg_seq != -1) {
435	if (timevalcmp(&targetut->uu_nlminfo->pid_start,
436	&ansp->la_msg_ident.pid_start, !=))
437	return (EPIPE);
438	if (targetut->uu_nlminfo->msg_seq != ansp->la_msg_ident.msg_seq)
439	return (0);
440	}
441
442	/* Found the thread, so set its return errno and wake it up. */
443
444	targetut->uu_nlminfo->retcode = ansp->la_errno;
445	targetut->uu_nlminfo->set_getlk = ansp->la_getlk_set;
446	targetut->uu_nlminfo->getlk_pid = ansp->la_getlk_pid;
447	targetut->uu_nlminfo->getlk_start = ansp->la_getlk_start;
448	targetut->uu_nlminfo->getlk_len = ansp->la_getlk_len;
449
450	(void)wakeup((void *)targetut->uu_nlminfo);
451
452	return (0);
453	}
454
455	/*
456	* nfslockdfd --
457	* NFS advisory byte-level locks: fifo file# from the lock daemon.
458	*/
459	int
460	nfslockdfd(struct proc *p, int fd)
461	{
462	int error;
463	struct file fp, ofp;
464
465	error = suser(p->p_ucred, &p->p_acflag);
466	if (error)
467	return (error);
468	if (fd < 0) {
469	fp = 0;
470	} else {
471	error = getvnode(p, fd, &fp);
472	if (error)
473	return (error);
474	(void)fref(fp);
475	}
476	ofp = nfslockdfp;
477	nfslockdfp = fp;
478	if (ofp)
479	(void)frele(ofp);
480	nfslockdpid = nfslockdfp ? p->p_pid : 0;
481	(void)wakeup((void *)&nfslockdfp);
482	return (0);
483	}
484
485	/*
486	* nfslockdwait --
487	* lock daemon waiting for lock request
488	*/
489	int
490	nfslockdwait(struct proc *p)
491	{
492	int error;
493	struct file fp, ofp;
494
495	if (p->p_pid != nfslockdpid) {
496	error = suser(p->p_ucred, &p->p_acflag);
497	if (error)
498	return (error);
499	}
500	if (nfslockdwaiting)
501	return (EBUSY);
502	if (nfslockdfifowritten) {
503	nfslockdfifowritten = 0;
504	return (0);
505	}
506
507	nfslockdwaiting = 1;
508	tsleep((void *)&nfslockdwaiting, PCATCH \| PUSER, "lockd", 0);
509	nfslockdwaiting = 0;
510
511	return (0);
512	}