]> git.saurik.com Git - apple/xnu.git/blob - bsd/nfs/nfs_lock.c
xnu-1504.3.12.tar.gz
[apple/xnu.git] / bsd / nfs / nfs_lock.c
1 /*
2 * Copyright (c) 2002-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*-
29 * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
30 *
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
33 * are met:
34 * 1. Redistributions of source code must retain the above copyright
35 * notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions and the following disclaimer in the
38 * documentation and/or other materials provided with the distribution.
39 * 3. Berkeley Software Design Inc's name may not be used to endorse or
40 * promote products derived from this software without specific prior
41 * written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
56 */
57
58 #include <sys/cdefs.h>
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/fcntl.h>
62 #include <sys/kernel.h> /* for hz */
63 #include <sys/file_internal.h>
64 #include <sys/malloc.h>
65 #include <sys/lockf.h> /* for hz */ /* Must come after sys/malloc.h */
66 #include <sys/kpi_mbuf.h>
67 #include <sys/mount_internal.h>
68 #include <sys/proc_internal.h> /* for p_start */
69 #include <sys/kauth.h>
70 #include <sys/resourcevar.h>
71 #include <sys/socket.h>
72 #include <sys/unistd.h>
73 #include <sys/user.h>
74 #include <sys/vnode_internal.h>
75
76 #include <kern/thread.h>
77 #include <kern/host.h>
78
79 #include <machine/limits.h>
80
81 #include <net/if.h>
82
83 #include <nfs/rpcv2.h>
84 #include <nfs/nfsproto.h>
85 #include <nfs/nfs.h>
86 #include <nfs/nfs_gss.h>
87 #include <nfs/nfsmount.h>
88 #include <nfs/nfsnode.h>
89 #include <nfs/nfs_lock.h>
90
91 #include <mach/host_priv.h>
92 #include <mach/mig_errors.h>
93 #include <mach/host_special_ports.h>
94 #include <lockd/lockd_mach.h>
95
96 extern void ipc_port_release_send(ipc_port_t);
97
98 #define OFF_MAX QUAD_MAX
99
100 /*
101 * pending lock request messages are kept in this queue which is
102 * kept sorted by transaction ID (xid).
103 */
104 static uint64_t nfs_lockxid = 0;
105 static LOCKD_MSG_QUEUE nfs_pendlockq;
106
107 /*
108 * This structure is used to identify processes which have acquired NFS locks.
109 * Knowing which processes have ever acquired locks allows us to short-circuit
110 * unlock requests for processes that have never had an NFS file lock. Thus
111 * avoiding a costly and unnecessary lockd request.
112 */
113 struct nfs_lock_pid {
114 TAILQ_ENTRY(nfs_lock_pid) lp_lru; /* LRU list */
115 LIST_ENTRY(nfs_lock_pid) lp_hash; /* hash chain */
116 int lp_valid; /* valid entry? */
117 int lp_time; /* last time seen valid */
118 pid_t lp_pid; /* The process ID. */
119 struct timeval lp_pid_start; /* Start time of process id */
120 };
121
122 #define NFS_LOCK_PID_HASH_SIZE 64 // XXX tune me
123 #define NFS_LOCK_PID_HASH(pid) \
124 (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
125 static LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
126 static TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
127 static u_long nfs_lock_pid_hash;
128 static uint32_t nfs_lock_pid_hash_trusted;
129
130 static lck_grp_t *nfs_lock_lck_grp;
131 static lck_mtx_t *nfs_lock_mutex;
132
133 void nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *);
134 void nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *);
135 int nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *, struct lockd_ans *);
136 LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_answer(struct lockd_ans *);
137 LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_xid(uint64_t);
138 uint64_t nfs_lockxid_get(void);
139 int nfs_lock_pid_check(proc_t, int);
140 int nfs_lockd_send_request(LOCKD_MSG *, int);
141
142 /*
143 * initialize global nfs lock state
144 */
145 void
146 nfs_lockinit(void)
147 {
148 TAILQ_INIT(&nfs_pendlockq);
149 nfs_lock_pid_hash_trusted = 1;
150 nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
151 M_TEMP, &nfs_lock_pid_hash);
152 TAILQ_INIT(&nfs_lock_pid_lru);
153
154 nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL);
155 nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL);
156 }
157
158 /*
159 * change the count of NFS mounts that may need to make lockd requests
160 *
161 * If the mount count drops to zero, then send a shutdown request to
162 * lockd if we've sent any requests to it.
163 */
164 void
165 nfs_lockd_mount_change(int i)
166 {
167 mach_port_t lockd_port = IPC_PORT_NULL;
168 kern_return_t kr;
169 int send_shutdown;
170
171 lck_mtx_lock(nfs_lock_mutex);
172
173 nfs_lockd_mounts += i;
174
175 /* send a shutdown request if there are no more lockd mounts */
176 send_shutdown = ((nfs_lockd_mounts == 0) && nfs_lockd_request_sent);
177 if (send_shutdown)
178 nfs_lockd_request_sent = 0;
179
180 lck_mtx_unlock(nfs_lock_mutex);
181
182 if (!send_shutdown)
183 return;
184
185 /*
186 * Let lockd know that it is no longer need for any NFS mounts
187 */
188 kr = host_get_lockd_port(host_priv_self(), &lockd_port);
189 if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(lockd_port)) {
190 printf("nfs_lockd_mount_change: shutdown couldn't get port, kr %d, port %s\n",
191 kr, (lockd_port == IPC_PORT_NULL) ? "NULL" :
192 (lockd_port == IPC_PORT_DEAD) ? "DEAD" : "VALID");
193 return;
194 }
195
196 kr = lockd_shutdown(lockd_port);
197 if (kr != KERN_SUCCESS)
198 printf("nfs_lockd_mount_change: shutdown %d\n", kr);
199
200 ipc_port_release_send(lockd_port);
201 }
202
203 /*
204 * insert a lock request message into the pending queue
205 * (nfs_lock_mutex must be held)
206 */
207 inline void
208 nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
209 {
210 LOCKD_MSG_REQUEST *mr;
211
212 mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
213 if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
214 /* fast path: empty queue or new largest xid */
215 TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
216 return;
217 }
218 /* slow path: need to walk list to find insertion point */
219 while (mr && (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
220 mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
221 }
222 if (mr) {
223 TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
224 } else {
225 TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
226 }
227 }
228
229 /*
230 * remove a lock request message from the pending queue
231 * (nfs_lock_mutex must be held)
232 */
233 inline void
234 nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
235 {
236 TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
237 }
238
239 /*
240 * find a pending lock request message by xid
241 *
242 * We search from the head of the list assuming that the message we're
243 * looking for is for an older request (because we have an answer to it).
244 * This assumes that lock request will be answered primarily in FIFO order.
245 * However, this may not be the case if there are blocked requests. We may
246 * want to move blocked requests to a separate queue (but that'll complicate
247 * duplicate xid checking).
248 *
249 * (nfs_lock_mutex must be held)
250 */
251 inline LOCKD_MSG_REQUEST *
252 nfs_lockdmsg_find_by_xid(uint64_t lockxid)
253 {
254 LOCKD_MSG_REQUEST *mr;
255
256 TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
257 if (mr->lmr_msg.lm_xid == lockxid)
258 return mr;
259 if (mr->lmr_msg.lm_xid > lockxid)
260 return NULL;
261 }
262 return mr;
263 }
264
/*
 * Because we can't depend on nlm_granted messages containing the same
 * cookie we sent with the original lock request, we need code to test
 * whether an nlm_granted answer matches a lock request.  We also need
 * code that can find a lockd message based solely on the nlm_granted
 * answer.
 */
271
272 /*
273 * compare lockd message to answer
274 *
275 * returns 0 on equality and 1 if different
276 */
277 inline int
278 nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
279 {
280 if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
281 return 1;
282 if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
283 return 1;
284 if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
285 return 1;
286 if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
287 return 1;
288 if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
289 return 1;
290 if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
291 return 1;
292 return 0;
293 }
294
295 /*
296 * find a pending lock request message based on the lock info provided
297 * in the lockd_ans/nlm_granted data. We need this because we can't
298 * depend on nlm_granted messages containing the same cookie we sent
299 * with the original lock request.
300 *
301 * We search from the head of the list assuming that the message we're
302 * looking for is for an older request (because we have an answer to it).
303 * This assumes that lock request will be answered primarily in FIFO order.
304 * However, this may not be the case if there are blocked requests. We may
305 * want to move blocked requests to a separate queue (but that'll complicate
306 * duplicate xid checking).
307 *
308 * (nfs_lock_mutex must be held)
309 */
310 inline LOCKD_MSG_REQUEST *
311 nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
312 {
313 LOCKD_MSG_REQUEST *mr;
314
315 if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
316 return NULL;
317 TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
318 if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
319 break;
320 }
321 return mr;
322 }
323
324 /*
325 * return the next unique lock request transaction ID
326 * (nfs_lock_mutex must be held)
327 */
328 inline uint64_t
329 nfs_lockxid_get(void)
330 {
331 LOCKD_MSG_REQUEST *mr;
332
333 /* derive initial lock xid from system time */
334 if (!nfs_lockxid) {
335 /*
336 * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
337 * due to a broken clock) because we immediately increment it
338 * and we guarantee to never use xid 0. So, nfs_lockxid should only
339 * ever be 0 the first time this function is called.
340 */
341 struct timeval tv;
342 microtime(&tv);
343 nfs_lockxid = (uint64_t)tv.tv_sec << 12;
344 }
345
346 /* make sure we get a unique xid */
347 do {
348 /* Skip zero xid if it should ever happen. */
349 if (++nfs_lockxid == 0)
350 nfs_lockxid++;
351 if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
352 (mr->lmr_msg.lm_xid < nfs_lockxid)) {
353 /* fast path: empty queue or new largest xid */
354 break;
355 }
356 /* check if xid is already in use */
357 } while (nfs_lockdmsg_find_by_xid(nfs_lockxid));
358
359 return nfs_lockxid;
360 }
361
362
363 /*
364 * Check the nfs_lock_pid hash table for an entry and, if requested,
365 * add the entry if it is not found.
366 *
367 * (Also, if adding, try to clean up some stale entries.)
368 * (nfs_lock_mutex must be held)
369 */
370 int
371 nfs_lock_pid_check(proc_t p, int addflag)
372 {
373 struct nfs_lock_pid *lp, *lplru, *lplru_next, *mlp;
374 TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_free;
375 proc_t plru = PROC_NULL;
376 pid_t pid;
377 int error = 0;
378 struct timeval now;
379
380 TAILQ_INIT(&nfs_lock_pid_free);
381 mlp = NULL;
382
383 loop:
384 /* Search hash chain */
385 pid = proc_pid(p);
386 error = ENOENT;
387 lp = NFS_LOCK_PID_HASH(pid)->lh_first;
388 for (; lp != NULL; lp = lp->lp_hash.le_next)
389 if (lp->lp_pid == pid) {
390 /* found pid... */
391 if (timevalcmp(&lp->lp_pid_start, &p->p_start, ==)) {
392 /* ...and it's valid */
393 /* move to tail of LRU */
394 TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
395 microuptime(&now);
396 lp->lp_time = now.tv_sec;
397 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
398 error = 0;
399 break;
400 }
401 /* ...but it's no longer valid */
402 /* remove from hash, invalidate, and move to lru head */
403 LIST_REMOVE(lp, lp_hash);
404 lp->lp_valid = 0;
405 TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
406 TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
407 lp = NULL;
408 break;
409 }
410
411 /* if we didn't find it (valid), use any newly allocated one */
412 if (!lp)
413 lp = mlp;
414
415 /* if we don't have an lp and we've been asked to add it */
416 if ((error == ENOENT) && addflag && !lp) {
417 /* scan lru list for invalid, stale entries to reuse/free */
418 int lrucnt = 0;
419 microuptime(&now);
420 for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
421 lplru_next = TAILQ_NEXT(lplru, lp_lru);
422 if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
423 /*
424 * If the oldest LRU entry is relatively new, then don't
425 * bother scanning any further.
426 */
427 break;
428 }
429 /* remove entry from LRU, and check if it's still in use */
430 TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
431 if (!lplru->lp_valid || !(plru = proc_find(lplru->lp_pid)) ||
432 timevalcmp(&lplru->lp_pid_start, &plru->p_start, !=)) {
433 if (plru != PROC_NULL) {
434 proc_rele(plru);
435 plru = PROC_NULL;
436 }
437 /* no longer in use */
438 LIST_REMOVE(lplru, lp_hash);
439 if (!lp) {
440 /* we'll reuse this one */
441 lp = lplru;
442 } else {
443 /* queue it up for freeing */
444 TAILQ_INSERT_HEAD(&nfs_lock_pid_free, lplru, lp_lru);
445 }
446 } else {
447 /* still in use */
448 if (plru != PROC_NULL) {
449 proc_rele(plru);
450 plru = PROC_NULL;
451 }
452 lplru->lp_time = now.tv_sec;
453 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
454 }
455 /* don't check too many entries at once */
456 if (++lrucnt > 8)
457 break;
458 }
459 if (!lp) {
460 /* we need to allocate a new one */
461 lck_mtx_unlock(nfs_lock_mutex);
462 MALLOC(mlp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
463 M_TEMP, M_WAITOK | M_ZERO);
464 lck_mtx_lock(nfs_lock_mutex);
465 if (mlp) /* make sure somebody hasn't already added this guy */
466 goto loop;
467 error = ENOMEM;
468 }
469 }
470 if ((error == ENOENT) && addflag && lp) {
471 /* (re)initialize nfs_lock_pid info */
472 lp->lp_pid = pid;
473 lp->lp_pid_start = p->p_start;
474 /* insert pid in hash */
475 LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
476 lp->lp_valid = 1;
477 lp->lp_time = now.tv_sec;
478 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
479 error = 0;
480 }
481
482 if ((mlp && (lp != mlp)) || TAILQ_FIRST(&nfs_lock_pid_free)) {
483 lck_mtx_unlock(nfs_lock_mutex);
484 if (mlp && (lp != mlp)) {
485 /* we didn't need this one, so we can free it */
486 FREE(mlp, M_TEMP);
487 }
488 /* free up any stale entries */
489 while ((lp = TAILQ_FIRST(&nfs_lock_pid_free))) {
490 TAILQ_REMOVE(&nfs_lock_pid_free, lp, lp_lru);
491 FREE(lp, M_TEMP);
492 }
493 lck_mtx_lock(nfs_lock_mutex);
494 }
495
496 return (error);
497 }
498
499 #define MACH_MAX_TRIES 3
500
501 int
502 nfs_lockd_send_request(LOCKD_MSG *msg, int interruptable)
503 {
504 kern_return_t kr;
505 int retries = 0;
506 mach_port_t lockd_port = IPC_PORT_NULL;
507
508 kr = host_get_lockd_port(host_priv_self(), &lockd_port);
509 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(lockd_port))
510 return (ENOTSUP);
511
512 do {
513 /* In the kernel all mach messaging is interruptable */
514 do {
515 kr = lockd_request(
516 lockd_port,
517 msg->lm_version,
518 msg->lm_flags,
519 msg->lm_xid,
520 msg->lm_fl.l_start,
521 msg->lm_fl.l_len,
522 msg->lm_fl.l_pid,
523 msg->lm_fl.l_type,
524 msg->lm_fl.l_whence,
525 (uint32_t *)&msg->lm_addr,
526 (uint32_t *)&msg->lm_cred,
527 msg->lm_fh_len,
528 msg->lm_fh);
529 if (kr != KERN_SUCCESS)
530 printf("lockd_request received %d!\n", kr);
531 } while (!interruptable && kr == MACH_SEND_INTERRUPTED);
532 } while (kr == MIG_SERVER_DIED && retries++ < MACH_MAX_TRIES);
533
534 ipc_port_release_send(lockd_port);
535 switch (kr) {
536 case MACH_SEND_INTERRUPTED:
537 return (EINTR);
538 default:
539 /*
540 * Other MACH or MIG errors we will retry. Eventually
541 * we will call nfs_down and allow the user to disable
542 * locking.
543 */
544 return (EAGAIN);
545 }
546 return (kr);
547 }
548
549
550 /*
551 * NFS advisory byte-level locks (client)
552 */
553 int
554 nfs3_vnop_advlock(
555 struct vnop_advlock_args /* {
556 struct vnodeop_desc *a_desc;
557 vnode_t a_vp;
558 caddr_t a_id;
559 int a_op;
560 struct flock *a_fl;
561 int a_flags;
562 vfs_context_t a_context;
563 } */ *ap)
564 {
565 vfs_context_t ctx;
566 proc_t p;
567 LOCKD_MSG_REQUEST msgreq;
568 LOCKD_MSG *msg;
569 vnode_t vp;
570 nfsnode_t np;
571 int error, error2;
572 int interruptable, modified;
573 struct flock *fl;
574 struct nfsmount *nmp;
575 struct nfs_vattr nvattr;
576 off_t start, end;
577 struct timeval now;
578 int timeo, endtime, lastmsg, wentdown = 0;
579 int lockpidcheck, nfsvers;
580 struct sockaddr *saddr;
581 struct timespec ts;
582
583 ctx = ap->a_context;
584 p = vfs_context_proc(ctx);
585 vp = ap->a_vp;
586 fl = ap->a_fl;
587 np = VTONFS(vp);
588
589 nmp = VTONMP(vp);
590 if (!nmp)
591 return (ENXIO);
592 lck_mtx_lock(&nmp->nm_lock);
593 if (nmp->nm_flag & NFSMNT_NOLOCKS) {
594 lck_mtx_unlock(&nmp->nm_lock);
595 return (ENOTSUP);
596 }
597 nfsvers = nmp->nm_vers;
598 lck_mtx_unlock(&nmp->nm_lock);
599
600 /*
601 * The NLM protocol doesn't allow the server to return an error
602 * on ranges, so we do it. Pre LFS (Large File Summit)
603 * standards required EINVAL for the range errors. More recent
604 * standards use EOVERFLOW, but their EINVAL wording still
605 * encompasses these errors.
606 * Any code sensitive to this is either:
607 * 1) written pre-LFS and so can handle only EINVAL, or
608 * 2) written post-LFS and thus ought to be tolerant of pre-LFS
609 * implementations.
610 * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
611 */
612 if (fl->l_whence != SEEK_END) {
613 if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
614 fl->l_start < 0 ||
615 (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
616 (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
617 return (EINVAL);
618 }
619
620 lck_mtx_lock(nfs_lock_mutex);
621
622 /*
623 * Need to check if this process has successfully acquired an NFS lock before.
624 * If not, and this is an unlock request we can simply return success here.
625 */
626 lockpidcheck = nfs_lock_pid_check(p, 0);
627 lck_mtx_unlock(nfs_lock_mutex);
628 if (lockpidcheck) {
629 if (lockpidcheck != ENOENT)
630 return (lockpidcheck);
631 if ((ap->a_op == F_UNLCK) && nfs_lock_pid_hash_trusted)
632 return (0);
633 }
634
635 /*
636 * The NFS Lock Manager protocol doesn't directly handle
637 * negative lengths or SEEK_END, so we need to normalize
638 * things here where we have all the info.
639 * (Note: SEEK_CUR is already adjusted for at this point)
640 */
641 /* Convert the flock structure into a start and end. */
642 switch (fl->l_whence) {
643 case SEEK_SET:
644 case SEEK_CUR:
645 /*
646 * Caller is responsible for adding any necessary offset
647 * to fl->l_start when SEEK_CUR is used.
648 */
649 start = fl->l_start;
650 break;
651 case SEEK_END:
652 /* need to flush, and refetch attributes to make */
653 /* sure we have the correct end of file offset */
654 if ((error = nfs_node_lock(np)))
655 return (error);
656 modified = (np->n_flag & NMODIFIED);
657 nfs_node_unlock(np);
658 if (modified && ((error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1))))
659 return (error);
660 if ((error = nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED)))
661 return (error);
662 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
663 start = np->n_size + fl->l_start;
664 nfs_data_unlock(np);
665 break;
666 default:
667 return (EINVAL);
668 }
669 if (fl->l_len == 0)
670 end = -1;
671 else if (fl->l_len > 0)
672 end = start + fl->l_len - 1;
673 else { /* l_len is negative */
674 end = start - 1;
675 start += fl->l_len;
676 }
677 if (start < 0)
678 return (EINVAL);
679
680 if ((nfsvers == NFS_VER2) &&
681 ((start >= 0x80000000) || (end >= 0x80000000)))
682 return (EINVAL);
683
684 /*
685 * Fill in the information structure.
686 * We set all values to zero with bzero to clear
687 * out any information in the sockaddr_storage
688 * and nfs_filehandle contained in msgreq so that
689 * we will not leak extraneous information out of
690 * the kernel when calling up to lockd via our mig
691 * generated routine.
692 */
693 bzero(&msgreq, sizeof(msgreq));
694 msg = &msgreq.lmr_msg;
695 msg->lm_version = LOCKD_MSG_VERSION;
696 msg->lm_flags = 0;
697
698 msg->lm_fl = *fl;
699 msg->lm_fl.l_start = start;
700 if (end != -1)
701 msg->lm_fl.l_len = end - start + 1;
702 msg->lm_fl.l_pid = vfs_context_pid(ctx);
703
704 if (ap->a_flags & F_WAIT)
705 msg->lm_flags |= LOCKD_MSG_BLOCK;
706 if (ap->a_op == F_GETLK)
707 msg->lm_flags |= LOCKD_MSG_TEST;
708
709 nmp = VTONMP(vp);
710 if (!nmp)
711 return (ENXIO);
712
713 lck_mtx_lock(&nmp->nm_lock);
714 saddr = mbuf_data(nmp->nm_nam);
715 bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len));
716 msg->lm_fh_len = (nfsvers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize;
717 bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len);
718 if (nfsvers == NFS_VER3)
719 msg->lm_flags |= LOCKD_MSG_NFSV3;
720 cru2x(vfs_context_ucred(ctx), &msg->lm_cred);
721
722 microuptime(&now);
723 lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
724 interruptable = nmp->nm_flag & NFSMNT_INT;
725 lck_mtx_unlock(&nmp->nm_lock);
726
727 lck_mtx_lock(nfs_lock_mutex);
728
729 /* allocate unique xid */
730 msg->lm_xid = nfs_lockxid_get();
731 nfs_lockdmsg_enqueue(&msgreq);
732
733 timeo = 2;
734
735 for (;;) {
736 nfs_lockd_request_sent = 1;
737
738 /* need to drop nfs_lock_mutex while calling nfs_lockd_send_request() */
739 lck_mtx_unlock(nfs_lock_mutex);
740 error = nfs_lockd_send_request(msg, interruptable);
741 lck_mtx_lock(nfs_lock_mutex);
742 if (error && error != EAGAIN)
743 break;
744
745 /*
746 * Always wait for an answer. Not waiting for unlocks could
747 * cause a lock to be left if the unlock request gets dropped.
748 */
749
750 /*
751 * Retry if it takes too long to get a response.
752 *
753 * The timeout numbers were picked out of thin air... they start
754 * at 2 and double each timeout with a max of 60 seconds.
755 *
756 * In order to maintain responsiveness, we pass a small timeout
757 * to msleep and calculate the timeouts ourselves. This allows
758 * us to pick up on mount changes quicker.
759 */
760 wait_for_granted:
761 error = EWOULDBLOCK;
762 ts.tv_sec = 2;
763 ts.tv_nsec = 0;
764 microuptime(&now);
765 endtime = now.tv_sec + timeo;
766 while (now.tv_sec < endtime) {
767 error = error2 = 0;
768 if (!msgreq.lmr_answered)
769 error = msleep(&msgreq, nfs_lock_mutex, PCATCH | PUSER, "lockd", &ts);
770 if (msgreq.lmr_answered) {
771 /*
772 * Note: it's possible to have a lock granted at
773 * essentially the same time that we get interrupted.
774 * Since the lock may be granted, we can't return an
775 * error from this request or we might not unlock the
776 * lock that's been granted.
777 */
778 nmp = VTONMP(vp);
779 if ((msgreq.lmr_errno == ENOTSUP) && nmp &&
780 (nmp->nm_state & NFSSTA_LOCKSWORK)) {
781 /*
782 * We have evidence that locks work, yet lockd
783 * returned ENOTSUP. This is probably because
784 * it was unable to contact the server's lockd
785 * to send it the request.
786 *
787 * Because we know locks work, we'll consider
788 * this failure to be a timeout.
789 */
790 error = EWOULDBLOCK;
791 } else {
792 error = 0;
793 }
794 break;
795 }
796 if (error != EWOULDBLOCK)
797 break;
798 /* check that we still have our mount... */
799 /* ...and that we still support locks */
800 nmp = VTONMP(vp);
801 if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) {
802 error = error2;
803 if (fl->l_type == F_UNLCK)
804 printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error);
805 break;
806 }
807 lck_mtx_lock(&nmp->nm_lock);
808 if (nmp->nm_flag & NFSMNT_NOLOCKS) {
809 lck_mtx_unlock(&nmp->nm_lock);
810 break;
811 }
812 interruptable = nmp->nm_flag & NFSMNT_INT;
813 lck_mtx_unlock(&nmp->nm_lock);
814 microuptime(&now);
815 }
816 if (error) {
817 /* check that we still have our mount... */
818 nmp = VTONMP(vp);
819 if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) {
820 error = error2;
821 if (error2 != EINTR) {
822 if (fl->l_type == F_UNLCK)
823 printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error);
824 break;
825 }
826 }
827 /* ...and that we still support locks */
828 lck_mtx_lock(&nmp->nm_lock);
829 if (nmp->nm_flag & NFSMNT_NOLOCKS) {
830 if (error == EWOULDBLOCK)
831 error = ENOTSUP;
832 lck_mtx_unlock(&nmp->nm_lock);
833 break;
834 }
835 interruptable = nmp->nm_flag & NFSMNT_INT;
836 if (error != EWOULDBLOCK) {
837 lck_mtx_unlock(&nmp->nm_lock);
838 /*
839 * We're going to bail on this request.
840 * If we were a blocked lock request, send a cancel.
841 */
842 if ((msgreq.lmr_errno == EINPROGRESS) &&
843 !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
844 /* set this request up as a cancel */
845 msg->lm_flags |= LOCKD_MSG_CANCEL;
846 nfs_lockdmsg_dequeue(&msgreq);
847 msg->lm_xid = nfs_lockxid_get();
848 nfs_lockdmsg_enqueue(&msgreq);
849 msgreq.lmr_saved_errno = error;
850 msgreq.lmr_errno = 0;
851 msgreq.lmr_answered = 0;
852 /* reset timeout */
853 timeo = 2;
854 /* send cancel request */
855 continue;
856 }
857 break;
858 }
859
860 /* warn if we're not getting any response */
861 microuptime(&now);
862 if ((msgreq.lmr_errno != EINPROGRESS) &&
863 !(msg->lm_flags & LOCKD_MSG_DENIED_GRACE) &&
864 (nmp->nm_tprintf_initial_delay != 0) &&
865 ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
866 lck_mtx_unlock(&nmp->nm_lock);
867 lastmsg = now.tv_sec;
868 nfs_down(nmp, vfs_context_thread(ctx), 0, NFSSTA_LOCKTIMEO, "lockd not responding");
869 wentdown = 1;
870 } else
871 lck_mtx_unlock(&nmp->nm_lock);
872
873 if (msgreq.lmr_errno == EINPROGRESS) {
874 /*
875 * We've got a blocked lock request that we are
876 * going to retry. First, we'll want to try to
877 * send a cancel for the previous request.
878 *
879 * Clear errno so if we don't get a response
880 * to the resend we'll call nfs_down().
881 * Also reset timeout because we'll expect a
882 * quick response to the cancel/resend (even if
883 * it is NLM_BLOCKED).
884 */
885 msg->lm_flags |= LOCKD_MSG_CANCEL;
886 nfs_lockdmsg_dequeue(&msgreq);
887 msg->lm_xid = nfs_lockxid_get();
888 nfs_lockdmsg_enqueue(&msgreq);
889 msgreq.lmr_saved_errno = msgreq.lmr_errno;
890 msgreq.lmr_errno = 0;
891 msgreq.lmr_answered = 0;
892 timeo = 2;
893 /* send cancel then resend request */
894 continue;
895 }
896
897 if (msg->lm_flags & LOCKD_MSG_DENIED_GRACE) {
898 /*
899 * Time to resend a request previously denied due to a grace period.
900 */
901 msg->lm_flags &= ~LOCKD_MSG_DENIED_GRACE;
902 nfs_lockdmsg_dequeue(&msgreq);
903 msg->lm_xid = nfs_lockxid_get();
904 nfs_lockdmsg_enqueue(&msgreq);
905 msgreq.lmr_saved_errno = 0;
906 msgreq.lmr_errno = 0;
907 msgreq.lmr_answered = 0;
908 timeo = 2;
909 /* resend request */
910 continue;
911 }
912
913 /*
914 * We timed out, so we will resend the request.
915 */
916 timeo *= 2;
917 if (timeo > 60)
918 timeo = 60;
919 /* resend request */
920 continue;
921 }
922
923 /* we got a reponse, so the server's lockd is OK */
924 nfs_up(VTONMP(vp), vfs_context_thread(ctx), NFSSTA_LOCKTIMEO,
925 wentdown ? "lockd alive again" : NULL);
926 wentdown = 0;
927
928 if (msgreq.lmr_answered && (msg->lm_flags & LOCKD_MSG_DENIED_GRACE)) {
929 /*
930 * The lock request was denied because the server lockd is
931 * still in its grace period. So, we need to try the
932 * request again in a little bit.
933 */
934 timeo = 4;
935 msgreq.lmr_answered = 0;
936 goto wait_for_granted;
937 }
938
939 if (msgreq.lmr_errno == EINPROGRESS) {
940 /* got NLM_BLOCKED response */
941 /* need to wait for NLM_GRANTED */
942 timeo = 60;
943 msgreq.lmr_answered = 0;
944 goto wait_for_granted;
945 }
946
947 if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
948 (msgreq.lmr_saved_errno == EINPROGRESS)) {
949 /*
950 * We just got a successful reply to the
951 * cancel of the previous blocked lock request.
952 * Now, go ahead and resend the request.
953 */
954 msg->lm_flags &= ~LOCKD_MSG_CANCEL;
955 nfs_lockdmsg_dequeue(&msgreq);
956 msg->lm_xid = nfs_lockxid_get();
957 nfs_lockdmsg_enqueue(&msgreq);
958 msgreq.lmr_saved_errno = 0;
959 msgreq.lmr_errno = 0;
960 msgreq.lmr_answered = 0;
961 timeo = 2;
962 /* resend request */
963 continue;
964 }
965
966 if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
967 if (msg->lm_fl.l_type != F_UNLCK) {
968 fl->l_type = msg->lm_fl.l_type;
969 fl->l_pid = msg->lm_fl.l_pid;
970 fl->l_start = msg->lm_fl.l_start;
971 fl->l_len = msg->lm_fl.l_len;
972 fl->l_whence = SEEK_SET;
973 } else
974 fl->l_type = F_UNLCK;
975 }
976
977 /*
978 * If the blocked lock request was cancelled.
979 * Restore the error condition from when we
980 * originally bailed on the request.
981 */
982 if (msg->lm_flags & LOCKD_MSG_CANCEL) {
983 msg->lm_flags &= ~LOCKD_MSG_CANCEL;
984 error = msgreq.lmr_saved_errno;
985 } else
986 error = msgreq.lmr_errno;
987
988 nmp = VTONMP(vp);
989 if ((error == ENOTSUP) && nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK)) {
990 /*
991 * We have NO evidence that locks work and lockd
992 * returned ENOTSUP. Let's take this as a hint
993 * that locks aren't supported and disable them
994 * for this mount.
995 */
996 lck_mtx_lock(&nmp->nm_lock);
997 nmp->nm_flag |= NFSMNT_NOLOCKS;
998 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
999 lck_mtx_unlock(&nmp->nm_lock);
1000 printf("lockd returned ENOTSUP, disabling locks for nfs server: %s\n",
1001 vfs_statfs(nmp->nm_mountp)->f_mntfromname);
1002 }
1003 if (!error) {
1004 /* record that NFS file locking has worked on this mount */
1005 if (nmp) {
1006 lck_mtx_lock(&nmp->nm_lock);
1007 if (!(nmp->nm_state & NFSSTA_LOCKSWORK))
1008 nmp->nm_state |= NFSSTA_LOCKSWORK;
1009 lck_mtx_unlock(&nmp->nm_lock);
1010 }
1011 /*
1012 * If we successfully acquired a lock, make sure this pid
1013 * is in the nfs_lock_pid hash table so we know we can't
1014 * short-circuit unlock requests.
1015 */
1016 if ((lockpidcheck == ENOENT) &&
1017 ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW))) {
1018 error = nfs_lock_pid_check(p, 1);
1019 if (error) {
1020 /*
1021 * We couldn't add the pid to the table,
1022 * so we can no longer trust that a pid
1023 * not in the table has no locks.
1024 */
1025 nfs_lock_pid_hash_trusted = 0;
1026 printf("nfs_vnop_advlock: pid add failed - no longer trusted\n");
1027 }
1028 }
1029 }
1030 break;
1031 }
1032
1033 nfs_lockdmsg_dequeue(&msgreq);
1034
1035 lck_mtx_unlock(nfs_lock_mutex);
1036
1037 return (error);
1038 }
1039
1040 /*
1041 * nfslockdans --
1042 * NFS advisory byte-level locks answer from the lock daemon.
1043 */
1044 int
1045 nfslockdans(proc_t p, struct lockd_ans *ansp)
1046 {
1047 LOCKD_MSG_REQUEST *msgreq;
1048 int error;
1049
1050 /* Let root make this call. */
1051 error = proc_suser(p);
1052 if (error)
1053 return (error);
1054
1055 /* the version should match, or we're out of sync */
1056 if (ansp->la_version != LOCKD_ANS_VERSION)
1057 return (EINVAL);
1058
1059 lck_mtx_lock(nfs_lock_mutex);
1060
1061 /* try to find the lockd message by transaction id (cookie) */
1062 msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
1063 if (ansp->la_flags & LOCKD_ANS_GRANTED) {
1064 /*
1065 * We can't depend on the granted message having our cookie,
1066 * so we check the answer against the lockd message found.
1067 * If no message was found or it doesn't match the answer,
1068 * we look for the lockd message by the answer's lock info.
1069 */
1070 if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
1071 msgreq = nfs_lockdmsg_find_by_answer(ansp);
1072 /*
1073 * We need to make sure this request isn't being cancelled
1074 * If it is, we don't want to accept the granted message.
1075 */
1076 if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
1077 msgreq = NULL;
1078 }
1079 if (!msgreq) {
1080 lck_mtx_unlock(nfs_lock_mutex);
1081 return (EPIPE);
1082 }
1083
1084 msgreq->lmr_errno = ansp->la_errno;
1085 if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
1086 if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
1087 if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
1088 msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
1089 else
1090 msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
1091 msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
1092 msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
1093 msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
1094 } else {
1095 msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
1096 }
1097 }
1098 if (ansp->la_flags & LOCKD_ANS_DENIED_GRACE)
1099 msgreq->lmr_msg.lm_flags |= LOCKD_MSG_DENIED_GRACE;
1100
1101 msgreq->lmr_answered = 1;
1102 lck_mtx_unlock(nfs_lock_mutex);
1103 wakeup(msgreq);
1104
1105 return (0);
1106 }
1107