]> git.saurik.com Git - apple/xnu.git/blame - bsd/nfs/nfs_lock.c
xnu-517.tar.gz
[apple/xnu.git] / bsd / nfs / nfs_lock.c
CommitLineData
55e303ae
A
1/*
2 * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25/*-
26 * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 * 1. Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * 2. Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in the
35 * documentation and/or other materials provided with the distribution.
36 * 3. Berkeley Software Design Inc's name may not be used to endorse or
37 * promote products derived from this software without specific prior
38 * written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
53 */
54
55#include <sys/cdefs.h>
56#include <sys/param.h>
57#include <sys/systm.h>
58#include <sys/fcntl.h>
59#include <sys/kernel.h> /* for hz */
60#include <sys/file.h>
61#include <sys/lock.h>
62#include <sys/malloc.h>
63#include <sys/lockf.h> /* for hz */ /* Must come after sys/malloc.h */
64#include <sys/mbuf.h>
65#include <sys/mount.h>
66#include <sys/namei.h>
67#include <sys/proc.h>
68#include <sys/resourcevar.h>
69#include <sys/socket.h>
70#include <sys/socket.h>
71#include <sys/unistd.h>
72#include <sys/user.h>
73#include <sys/vnode.h>
74
75#include <kern/thread_act.h>
76
77#include <machine/limits.h>
78
79#include <net/if.h>
80
81#include <nfs/rpcv2.h>
82#include <nfs/nfsproto.h>
83#include <nfs/nfs.h>
84#include <nfs/nfsmount.h>
85#include <nfs/nfsnode.h>
86#include <nfs/nfs_lock.h>
87#include <nfs/nlminfo.h>
88
/* Largest file offset used by the range checks in nfs_dolock(). */
#define OFF_MAX		QUAD_MAX

/* Bits kept in nfslockdfifolock. */
#define NFSLOCKDFIFOLOCK_LOCKED	1	/* somebody is writing to the fifo */
#define NFSLOCKDFIFOLOCK_WANT	2	/* somebody is sleeping for the fifo */

/* Statistics updated at the end of every request in nfs_dolock(). */
uint64_t	nfsadvlocks = 0;		/* requests counted */
struct timeval	nfsadvlock_longest = { 0, 0 };	/* longest single request */
struct timeval	nfsadvlocks_time = { 0, 0 };	/* accumulated request time */

/* State shared between the lock request path and the lock daemon calls. */
pid_t		nfslockdpid = 0;	/* pid that registered the fifo */
struct file	*nfslockdfp = NULL;	/* daemon's fifo, set by nfslockdfd() */
int		nfslockdwaiting = 0;	/* daemon is asleep in nfslockdwait() */
int		nfslockdfifowritten = 0; /* fifo written since last wait */
int		nfslockdfifolock = 0;	/* NFSLOCKDFIFOLOCK_* bits */
102
/*
 * XXX
 * We have to let the process know if the call succeeded.  I'm using an extra
 * field in the structure referenced by the uthread's uu_nlminfo pointer, as
 * that structure is already used for lockd stuff.
 */
109
110/*
111 * nfs_advlock --
112 * NFS advisory byte-level locks.
113 */
114int
115nfs_dolock(struct vop_advlock_args *ap)
116/* struct vop_advlock_args {
117 struct vnodeop_desc *a_desc;
118 struct vnode *a_vp;
119 caddr_t a_id;
120 int a_op;
121 struct flock *a_fl;
122 int a_flags;
123}; */
124{
125 LOCKD_MSG msg;
126 struct nameidata nd;
127 struct vnode *vp, *wvp;
128 struct nfsnode *np;
129 int error, error1;
130 struct flock *fl;
131 int fmode, ioflg;
132 struct proc *p;
133 struct uthread *ut;
134 struct timeval elapsed;
135 struct nfsmount *nmp;
136 struct vattr vattr;
137 off_t start, end;
138
139 ut = get_bsdthread_info(current_act());
140 p = current_proc();
141
142 vp = ap->a_vp;
143 fl = ap->a_fl;
144 np = VTONFS(vp);
145
146 nmp = VFSTONFS(vp->v_mount);
147 if (!nmp)
148 return (ENXIO);
149 if (nmp->nm_flag & NFSMNT_NOLOCKS)
150 return (EOPNOTSUPP);
151
152 /*
153 * The NLM protocol doesn't allow the server to return an error
154 * on ranges, so we do it. Pre LFS (Large File Summit)
155 * standards required EINVAL for the range errors. More recent
156 * standards use EOVERFLOW, but their EINVAL wording still
157 * encompasses these errors.
158 * Any code sensitive to this is either:
159 * 1) written pre-LFS and so can handle only EINVAL, or
160 * 2) written post-LFS and thus ought to be tolerant of pre-LFS
161 * implementations.
162 * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
163 */
164 if (fl->l_whence != SEEK_END) {
165 if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
166 fl->l_start < 0 ||
167 (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
168 (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
169 return (EINVAL);
170 }
171 /*
172 * If daemon is running take a ref on its fifo
173 */
174 if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) {
175 if (!nfslockdwaiting)
176 return (EOPNOTSUPP);
177 /*
178 * Don't wake lock daemon if it hasn't been started yet and
179 * this is an unlock request (since we couldn't possibly
180 * actually have a lock on the file). This could be an
181 * uninformed unlock request due to closef()'s behavior of doing
182 * unlocks on all files if a process has had a lock on ANY file.
183 */
184 if (!nfslockdfp && (fl->l_type == F_UNLCK))
185 return (EINVAL);
186 /* wake up lock daemon */
187 (void)wakeup((void *)&nfslockdwaiting);
188 /* wait on nfslockdfp for a while to allow daemon to start */
189 tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz);
190 /* check for nfslockdfp and f_data */
191 if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data))
192 return (EOPNOTSUPP);
193 }
194 VREF(wvp);
195 /*
196 * if there is no nfsowner table yet, allocate one.
197 */
198 if (ut->uu_nlminfo == NULL) {
199 if (ap->a_op == F_UNLCK) {
200 vrele(wvp);
201 return (0);
202 }
203 MALLOC(ut->uu_nlminfo, struct nlminfo *,
204 sizeof(struct nlminfo), M_LOCKF, M_WAITOK | M_ZERO);
205 ut->uu_nlminfo->pid_start = p->p_stats->p_start;
206 }
207 /*
208 * Fill in the information structure.
209 */
210 msg.lm_version = LOCKD_MSG_VERSION;
211 msg.lm_msg_ident.pid = p->p_pid;
212 msg.lm_msg_ident.ut = ut;
213 msg.lm_msg_ident.pid_start = ut->uu_nlminfo->pid_start;
214 msg.lm_msg_ident.msg_seq = ++(ut->uu_nlminfo->msg_seq);
215
216 /*
217 * The NFS Lock Manager protocol doesn't directly handle
218 * negative lengths or SEEK_END, so we need to normalize
219 * things here where we have all the info.
220 * (Note: SEEK_CUR is already adjusted for at this point)
221 */
222 /* Convert the flock structure into a start and end. */
223 switch (fl->l_whence) {
224 case SEEK_SET:
225 case SEEK_CUR:
226 /*
227 * Caller is responsible for adding any necessary offset
228 * to fl->l_start when SEEK_CUR is used.
229 */
230 start = fl->l_start;
231 break;
232 case SEEK_END:
233 /* need to flush, and refetch attributes to make */
234 /* sure we have the correct end of file offset */
235 if (np->n_flag & NMODIFIED) {
236 np->n_attrstamp = 0;
237 error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1);
238 if (error) {
239 vrele(wvp);
240 return (error);
241 }
242 }
243 np->n_attrstamp = 0;
244 error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
245 if (error) {
246 vrele(wvp);
247 return (error);
248 }
249 start = np->n_size + fl->l_start;
250 break;
251 default:
252 vrele(wvp);
253 return (EINVAL);
254 }
255 if (fl->l_len == 0)
256 end = -1;
257 else if (fl->l_len > 0)
258 end = start + fl->l_len - 1;
259 else { /* l_len is negative */
260 end = start - 1;
261 start += fl->l_len;
262 }
263 if (start < 0) {
264 vrele(wvp);
265 return (EINVAL);
266 }
267
268 msg.lm_fl = *fl;
269 msg.lm_fl.l_start = start;
270 if (end != -1)
271 msg.lm_fl.l_len = end - start + 1;
272
273 msg.lm_wait = ap->a_flags & F_WAIT;
274 msg.lm_getlk = ap->a_op == F_GETLK;
275
276 nmp = VFSTONFS(vp->v_mount);
277 if (!nmp) {
278 vrele(wvp);
279 return (ENXIO);
280 }
281
282 bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg.lm_addr,
283 min(sizeof msg.lm_addr,
284 mtod(nmp->nm_nam, struct sockaddr *)->sa_len));
285 msg.lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
286 bcopy(VTONFS(vp)->n_fhp, msg.lm_fh, msg.lm_fh_len);
287 msg.lm_nfsv3 = NFS_ISV3(vp);
288 cru2x(p->p_ucred, &msg.lm_cred);
289
290 microuptime(&ut->uu_nlminfo->nlm_lockstart);
291
292 fmode = FFLAGS(O_WRONLY);
293 if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) {
294 vrele(wvp);
295 return (error);
296 }
297 ++wvp->v_writecount;
298
299#define IO_NOMACCHECK 0;
300 ioflg = IO_UNIT | IO_NOMACCHECK;
301 for (;;) {
302 VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE);
303
304 while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
305 nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
306 if (tsleep((void *)&nfslockdfifolock, PCATCH | PUSER, "lockdfifo", 20*hz))
307 break;
308 }
309 nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;
310
311 error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)&msg, sizeof(msg), 0,
312 UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p);
313
314 nfslockdfifowritten = 1;
315
316 nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
317 if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
318 nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
319 wakeup((void *)&nfslockdfifolock);
320 }
321 /* wake up lock daemon */
322 if (nfslockdwaiting)
323 (void)wakeup((void *)&nfslockdwaiting);
324
325 if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
326 break;
327 }
328 /*
329 * If we're locking a file, wait for an answer. Unlocks succeed
330 * immediately.
331 */
332 if (fl->l_type == F_UNLCK)
333 /*
334 * XXX this isn't exactly correct. The client side
335 * needs to continue sending it's unlock until
336 * it gets a response back.
337 */
338 break;
339
340 /*
341 * retry after 20 seconds if we haven't gotten a response yet.
342 * This number was picked out of thin air... but is longer
343 * then even a reasonably loaded system should take (at least
344 * on a local network). XXX Probably should use a back-off
345 * scheme.
346 */
347 if ((error = tsleep((void *)ut->uu_nlminfo,
348 PCATCH | PUSER, "lockd", 20*hz)) != 0) {
349 if (error == EWOULDBLOCK) {
350 /*
351 * We timed out, so we rewrite the request
352 * to the fifo, but only if it isn't already
353 * full.
354 */
355 ioflg |= IO_NDELAY;
356 continue;
357 }
358
359 break;
360 }
361
362 if (msg.lm_getlk && ut->uu_nlminfo->retcode == 0) {
363 if (ut->uu_nlminfo->set_getlk) {
364 fl->l_pid = ut->uu_nlminfo->getlk_pid;
365 fl->l_start = ut->uu_nlminfo->getlk_start;
366 fl->l_len = ut->uu_nlminfo->getlk_len;
367 fl->l_whence = SEEK_SET;
368 } else {
369 fl->l_type = F_UNLCK;
370 }
371 }
372 error = ut->uu_nlminfo->retcode;
373 break;
374 }
375
376 /* XXX stats */
377 nfsadvlocks++;
378 microuptime(&elapsed);
379 timevalsub(&elapsed, &ut->uu_nlminfo->nlm_lockstart);
380 if (timevalcmp(&elapsed, &nfsadvlock_longest, >))
381 nfsadvlock_longest = elapsed;
382 timevaladd(&nfsadvlocks_time, &elapsed);
383 timerclear(&ut->uu_nlminfo->nlm_lockstart);
384
385 error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p);
386 /* prefer any previous 'error' to our vn_close 'error1'. */
387 return (error != 0 ? error : error1);
388}
389
390/*
391 * nfslockdans --
392 * NFS advisory byte-level locks answer from the lock daemon.
393 */
394int
395nfslockdans(struct proc *p, struct lockd_ans *ansp)
396{
397 struct proc *targetp;
398 struct uthread *targetut, *uth;
399 int error;
400
401 /*
402 * Let root, or someone who once was root (lockd generally
403 * switches to the daemon uid once it is done setting up) make
404 * this call.
405 *
406 * XXX This authorization check is probably not right.
407 */
408 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 &&
409 p->p_cred->p_svuid != 0)
410 return (error);
411
412 /* the version should match, or we're out of sync */
413 if (ansp->la_vers != LOCKD_ANS_VERSION)
414 return (EINVAL);
415
416 /* Find the process & thread */
417 if ((targetp = pfind(ansp->la_msg_ident.pid)) == NULL)
418 return (ESRCH);
419 targetut = ansp->la_msg_ident.ut;
420 TAILQ_FOREACH(uth, &targetp->p_uthlist, uu_list) {
421 if (uth == targetut)
422 break;
423 }
424 /*
425 * Verify the pid hasn't been reused (if we can), and it isn't waiting
426 * for an answer from a more recent request. We return an EPIPE if
427 * the match fails, because we've already used ESRCH above, and this
428 * is sort of like writing on a pipe after the reader has closed it.
429 * If only the seq# is off, don't return an error just return. It could
430 * just be a response to a retransmitted request.
431 */
432 if (uth == NULL || uth != targetut || targetut->uu_nlminfo == NULL)
433 return (EPIPE);
434 if (ansp->la_msg_ident.msg_seq != -1) {
435 if (timevalcmp(&targetut->uu_nlminfo->pid_start,
436 &ansp->la_msg_ident.pid_start, !=))
437 return (EPIPE);
438 if (targetut->uu_nlminfo->msg_seq != ansp->la_msg_ident.msg_seq)
439 return (0);
440 }
441
442 /* Found the thread, so set its return errno and wake it up. */
443
444 targetut->uu_nlminfo->retcode = ansp->la_errno;
445 targetut->uu_nlminfo->set_getlk = ansp->la_getlk_set;
446 targetut->uu_nlminfo->getlk_pid = ansp->la_getlk_pid;
447 targetut->uu_nlminfo->getlk_start = ansp->la_getlk_start;
448 targetut->uu_nlminfo->getlk_len = ansp->la_getlk_len;
449
450 (void)wakeup((void *)targetut->uu_nlminfo);
451
452 return (0);
453}
454
455/*
456 * nfslockdfd --
457 * NFS advisory byte-level locks: fifo file# from the lock daemon.
458 */
459int
460nfslockdfd(struct proc *p, int fd)
461{
462 int error;
463 struct file *fp, *ofp;
464
465 error = suser(p->p_ucred, &p->p_acflag);
466 if (error)
467 return (error);
468 if (fd < 0) {
469 fp = 0;
470 } else {
471 error = getvnode(p, fd, &fp);
472 if (error)
473 return (error);
474 (void)fref(fp);
475 }
476 ofp = nfslockdfp;
477 nfslockdfp = fp;
478 if (ofp)
479 (void)frele(ofp);
480 nfslockdpid = nfslockdfp ? p->p_pid : 0;
481 (void)wakeup((void *)&nfslockdfp);
482 return (0);
483}
484
485/*
486 * nfslockdwait --
487 * lock daemon waiting for lock request
488 */
489int
490nfslockdwait(struct proc *p)
491{
492 int error;
493 struct file *fp, *ofp;
494
495 if (p->p_pid != nfslockdpid) {
496 error = suser(p->p_ucred, &p->p_acflag);
497 if (error)
498 return (error);
499 }
500 if (nfslockdwaiting)
501 return (EBUSY);
502 if (nfslockdfifowritten) {
503 nfslockdfifowritten = 0;
504 return (0);
505 }
506
507 nfslockdwaiting = 1;
508 tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
509 nfslockdwaiting = 0;
510
511 return (0);
512}