bsd/nfs/nfs_lock.c

   1 /*
   2  * Copyright (c) 2002-2008 Apple Inc.  All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*-
  29  * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
  30  *
  31  * Redistribution and use in source and binary forms, with or without
  32  * modification, are permitted provided that the following conditions
  33  * are met:
  34  * 1. Redistributions of source code must retain the above copyright
  35  *    notice, this list of conditions and the following disclaimer.
  36  * 2. Redistributions in binary form must reproduce the above copyright
  37  *    notice, this list of conditions and the following disclaimer in the
  38  *    documentation and/or other materials provided with the distribution.
  39  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  40  *    promote products derived from this software without specific prior
  41  *    written permission.
  42  *
  43  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  44  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  45  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  46  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  47  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  48  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  49  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  50  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  51  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  52  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  53  * SUCH DAMAGE.
  54  *
  55  *      from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
  56  */
  57
  58 #include <sys/cdefs.h>
  59 #include <sys/param.h>
  60 #include <sys/systm.h>
  61 #include <sys/fcntl.h>
  62 #include <sys/kernel.h>         /* for hz */
  63 #include <sys/file_internal.h>
  64 #include <sys/malloc.h>
  65 #include <sys/lockf.h>          /* for hz */ /* Must come after sys/malloc.h */
  66 #include <sys/kpi_mbuf.h>
  67 #include <sys/mount_internal.h>
  68 #include <sys/proc_internal.h>  /* for p_start */
  69 #include <sys/kauth.h>
  70 #include <sys/resourcevar.h>
  71 #include <sys/socket.h>
  72 #include <sys/unistd.h>
  73 #include <sys/user.h>
  74 #include <sys/vnode_internal.h>
  75
  76 #include <kern/thread.h>
  77 #include <kern/host.h>
  78
  79 #include <machine/limits.h>
  80
  81 #include <net/if.h>
  82
  83 #include <nfs/rpcv2.h>
  84 #include <nfs/nfsproto.h>
  85 #include <nfs/nfs.h>
  86 #include <nfs/nfs_gss.h>
  87 #include <nfs/nfsmount.h>
  88 #include <nfs/nfsnode.h>
  89 #include <nfs/nfs_lock.h>
  90
  91 #include <mach/host_priv.h>
  92 #include <mach/mig_errors.h>
  93 #include <mach/host_special_ports.h>
  94 #include <lockd/lockd_mach.h>
  95
  96 extern void ipc_port_release_send(ipc_port_t);
  97
  98 #define OFF_MAX QUAD_MAX
  99
 100 /*
 101  * pending lock request messages are kept in this queue which is
 102  * kept sorted by transaction ID (xid).
 103  */
 104 static uint64_t nfs_lockxid = 0;
 105 static LOCKD_MSG_QUEUE nfs_pendlockq;
 106
 107 /*
 108  * This structure is used to identify processes which have acquired NFS locks.
 109  * Knowing which processes have ever acquired locks allows us to short-circuit
 110  * unlock requests for processes that have never had an NFS file lock.  Thus
 111  * avoiding a costly and unnecessary lockd request.
 112  */
 113 struct nfs_lock_pid {
 114         TAILQ_ENTRY(nfs_lock_pid)       lp_lru;         /* LRU list */
 115         LIST_ENTRY(nfs_lock_pid)        lp_hash;        /* hash chain */
 116         int                             lp_valid;       /* valid entry? */
 117         int                             lp_time;        /* last time seen valid */
 118         pid_t                           lp_pid;         /* The process ID. */
 119         struct timeval                  lp_pid_start;   /* Start time of process id */
 120 };
 121
 122 #define NFS_LOCK_PID_HASH_SIZE          64      // XXX tune me
 123 #define NFS_LOCK_PID_HASH(pid)  \
 124         (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
 125 static LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
 126 static TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
 127 static u_long nfs_lock_pid_hash;
 128 static uint32_t nfs_lock_pid_hash_trusted;
 129
 130 static lck_grp_t *nfs_lock_lck_grp;
 131 static lck_mtx_t *nfs_lock_mutex;
 132
 133 void nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *);
 134 void nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *);
 135 int nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *, struct lockd_ans *);
 136 LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_answer(struct lockd_ans *);
 137 LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_xid(uint64_t);
 138 uint64_t nfs_lockxid_get(void);
 139 int nfs_lock_pid_check(proc_t, int);
 140 int nfs_lockd_send_request(LOCKD_MSG *, int);
 141
 142 /*
 143  * initialize global nfs lock state
 144  */
 145 void
 146 nfs_lockinit(void)
 147 {
 148         TAILQ_INIT(&nfs_pendlockq);
 149         nfs_lock_pid_hash_trusted = 1;
 150         nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
 151                                          M_TEMP, &nfs_lock_pid_hash);
 152         TAILQ_INIT(&nfs_lock_pid_lru);
 153
 154         nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL);
 155         nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL);
 156 }
 157
 158 /*
 159  * change the count of NFS mounts that may need to make lockd requests
 160  *
 161  * If the mount count drops to zero, then send a shutdown request to
 162  * lockd if we've sent any requests to it.
 163  */
 164 void
 165 nfs_lockd_mount_change(int i)
 166 {
 167         mach_port_t lockd_port = IPC_PORT_NULL;
 168         kern_return_t kr;
 169         int send_shutdown;
 170
 171         lck_mtx_lock(nfs_lock_mutex);
 172
 173         nfs_lockd_mounts += i;
 174
 175         /* send a shutdown request if there are no more lockd mounts */
 176         send_shutdown = ((nfs_lockd_mounts == 0) && nfs_lockd_request_sent);
 177         if (send_shutdown)
 178                 nfs_lockd_request_sent = 0;
 179
 180         lck_mtx_unlock(nfs_lock_mutex);
 181
 182         if (!send_shutdown)
 183                 return;
 184
 185         /*
 186          * Let lockd know that it is no longer need for any NFS mounts
 187          */
 188         kr = host_get_lockd_port(host_priv_self(), &lockd_port);
 189         if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(lockd_port)) {
 190                 printf("nfs_lockd_mount_change: shutdown couldn't get port, kr %d, port %s\n",
 191                         kr, (lockd_port == IPC_PORT_NULL) ? "NULL" :
 192                         (lockd_port == IPC_PORT_DEAD) ? "DEAD" : "VALID");
 193                 return;
 194         }
 195
 196         kr = lockd_shutdown(lockd_port);
 197         if (kr != KERN_SUCCESS)
 198                 printf("nfs_lockd_mount_change: shutdown %d\n", kr);
 199
 200         ipc_port_release_send(lockd_port);
 201 }
 202
 203 /*
 204  * insert a lock request message into the pending queue
 205  * (nfs_lock_mutex must be held)
 206  */
 207 inline void
 208 nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
 209 {
 210         LOCKD_MSG_REQUEST *mr;
 211
 212         mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
 213         if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
 214                 /* fast path: empty queue or new largest xid */
 215                 TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
 216                 return;
 217         }
 218         /* slow path: need to walk list to find insertion point */
 219         while (mr && (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
 220                 mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
 221         }
 222         if (mr) {
 223                 TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
 224         } else {
 225                 TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
 226         }
 227 }
 228
 229 /*
 230  * remove a lock request message from the pending queue
 231  * (nfs_lock_mutex must be held)
 232  */
 233 inline void
 234 nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
 235 {
 236         TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
 237 }
 238
 239 /*
 240  * find a pending lock request message by xid
 241  *
 242  * We search from the head of the list assuming that the message we're
 243  * looking for is for an older request (because we have an answer to it).
 244  * This assumes that lock request will be answered primarily in FIFO order.
 245  * However, this may not be the case if there are blocked requests.  We may
 246  * want to move blocked requests to a separate queue (but that'll complicate
 247  * duplicate xid checking).
 248  *
 249  * (nfs_lock_mutex must be held)
 250  */
 251 inline LOCKD_MSG_REQUEST *
 252 nfs_lockdmsg_find_by_xid(uint64_t lockxid)
 253 {
 254         LOCKD_MSG_REQUEST *mr;
 255
 256         TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
 257                 if (mr->lmr_msg.lm_xid == lockxid)
 258                         return mr;
 259                 if (mr->lmr_msg.lm_xid > lockxid)
 260                         return NULL;
 261         }
 262         return mr;
 263 }
 264
 265 /*
 266  * Because we can't depend on nlm_granted messages containing the same
 267  * cookie we sent with the original lock request, we need code test if
 268  * an nlm_granted answer matches the lock request.  We also need code
 269  * that can find a lockd message based solely on the nlm_granted answer.
 270  */
 271
 272 /*
 273  * compare lockd message to answer
 274  *
 275  * returns 0 on equality and 1 if different
 276  */
 277 inline int
 278 nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
 279 {
 280         if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
 281                 return 1;
 282         if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
 283                 return 1;
 284         if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
 285                 return 1;
 286         if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
 287                 return 1;
 288         if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
 289                 return 1;
 290         if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
 291                 return 1;
 292         return 0;
 293 }
 294
 295 /*
 296  * find a pending lock request message based on the lock info provided
 297  * in the lockd_ans/nlm_granted data.  We need this because we can't
 298  * depend on nlm_granted messages containing the same cookie we sent
 299  * with the original lock request.
 300  *
 301  * We search from the head of the list assuming that the message we're
 302  * looking for is for an older request (because we have an answer to it).
 303  * This assumes that lock request will be answered primarily in FIFO order.
 304  * However, this may not be the case if there are blocked requests.  We may
 305  * want to move blocked requests to a separate queue (but that'll complicate
 306  * duplicate xid checking).
 307  *
 308  * (nfs_lock_mutex must be held)
 309  */
 310 inline LOCKD_MSG_REQUEST *
 311 nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
 312 {
 313         LOCKD_MSG_REQUEST *mr;
 314
 315         if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
 316                 return NULL;
 317         TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
 318                 if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
 319                         break;
 320         }
 321         return mr;
 322 }
 323
 324 /*
 325  * return the next unique lock request transaction ID
 326  * (nfs_lock_mutex must be held)
 327  */
 328 inline uint64_t
 329 nfs_lockxid_get(void)
 330 {
 331         LOCKD_MSG_REQUEST *mr;
 332
 333         /* derive initial lock xid from system time */
 334         if (!nfs_lockxid) {
 335                 /*
 336                  * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
 337                  * due to a broken clock) because we immediately increment it
 338                  * and we guarantee to never use xid 0.  So, nfs_lockxid should only
 339                  * ever be 0 the first time this function is called.
 340                  */
 341                 struct timeval tv;
 342                 microtime(&tv);
 343                 nfs_lockxid = (uint64_t)tv.tv_sec << 12;
 344         }
 345
 346         /* make sure we get a unique xid */
 347         do {
 348                 /* Skip zero xid if it should ever happen.  */
 349                 if (++nfs_lockxid == 0)
 350                         nfs_lockxid++;
 351                 if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
 352                      (mr->lmr_msg.lm_xid < nfs_lockxid)) {
 353                         /* fast path: empty queue or new largest xid */
 354                         break;
 355                 }
 356                 /* check if xid is already in use */
 357         } while (nfs_lockdmsg_find_by_xid(nfs_lockxid));
 358
 359         return nfs_lockxid;
 360 }
 361
 362
 363 /*
 364  * Check the nfs_lock_pid hash table for an entry and, if requested,
 365  * add the entry if it is not found.
 366  *
 367  * (Also, if adding, try to clean up some stale entries.)
 368  * (nfs_lock_mutex must be held)
 369  */
 370 int
 371 nfs_lock_pid_check(proc_t p, int addflag)
 372 {
 373         struct nfs_lock_pid *lp, *lplru, *lplru_next, *mlp;
 374         TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_free;
 375         proc_t plru = PROC_NULL;
 376         pid_t pid;
 377         int error = 0;
 378         struct timeval now;
 379
 380         TAILQ_INIT(&nfs_lock_pid_free);
 381         mlp = NULL;
 382
 383 loop:
 384         /* Search hash chain */
 385         pid = proc_pid(p);
 386         error = ENOENT;
 387         lp = NFS_LOCK_PID_HASH(pid)->lh_first;
 388         for (; lp != NULL; lp = lp->lp_hash.le_next)
 389                 if (lp->lp_pid == pid) {
 390                         /* found pid... */
 391                         if (timevalcmp(&lp->lp_pid_start, &p->p_start, ==)) {
 392                                 /* ...and it's valid */
 393                                 /* move to tail of LRU */
 394                                 TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
 395                                 microuptime(&now);
 396                                 lp->lp_time = now.tv_sec;
 397                                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
 398                                 error = 0;
 399                                 break;
 400                         }
 401                         /* ...but it's no longer valid */
 402                         /* remove from hash, invalidate, and move to lru head */
 403                         LIST_REMOVE(lp, lp_hash);
 404                         lp->lp_valid = 0;
 405                         TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
 406                         TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
 407                         lp = NULL;
 408                         break;
 409                 }
 410
 411         /* if we didn't find it (valid), use any newly allocated one */
 412         if (!lp)
 413                 lp = mlp;
 414
 415         /* if we don't have an lp and we've been asked to add it */
 416         if ((error == ENOENT) && addflag && !lp) {
 417                 /* scan lru list for invalid, stale entries to reuse/free */
 418                 int lrucnt = 0;
 419                 microuptime(&now);
 420                 for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
 421                         lplru_next = TAILQ_NEXT(lplru, lp_lru);
 422                         if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
 423                                 /*
 424                                  * If the oldest LRU entry is relatively new, then don't
 425                                  * bother scanning any further.
 426                                  */
 427                                 break;
 428                         }
 429                         /* remove entry from LRU, and check if it's still in use */
 430                         TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
 431                         if (!lplru->lp_valid || !(plru = proc_find(lplru->lp_pid)) ||
 432                             timevalcmp(&lplru->lp_pid_start, &plru->p_start, !=)) {
 433                                 if (plru != PROC_NULL) {
 434                                         proc_rele(plru);
 435                                         plru = PROC_NULL;
 436                                 }
 437                                 /* no longer in use */
 438                                 LIST_REMOVE(lplru, lp_hash);
 439                                 if (!lp) {
 440                                         /* we'll reuse this one */
 441                                         lp = lplru;
 442                                 } else {
 443                                         /* queue it up for freeing */
 444                                         TAILQ_INSERT_HEAD(&nfs_lock_pid_free, lplru, lp_lru);
 445                                 }
 446                         } else {
 447                                 /* still in use */
 448                                 if (plru != PROC_NULL) {
 449                                         proc_rele(plru);
 450                                         plru = PROC_NULL;
 451                                 }
 452                                 lplru->lp_time = now.tv_sec;
 453                                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
 454                         }
 455                         /* don't check too many entries at once */
 456                         if (++lrucnt > 8)
 457                                 break;
 458                 }
 459                 if (!lp) {
 460                         /* we need to allocate a new one */
 461                         lck_mtx_unlock(nfs_lock_mutex);
 462                         MALLOC(mlp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
 463                                 M_TEMP, M_WAITOK | M_ZERO);
 464                         lck_mtx_lock(nfs_lock_mutex);
 465                         if (mlp) /* make sure somebody hasn't already added this guy */
 466                                 goto loop;
 467                         error = ENOMEM;
 468                 }
 469         }
 470         if ((error == ENOENT) && addflag && lp) {
 471                 /* (re)initialize nfs_lock_pid info */
 472                 lp->lp_pid = pid;
 473                 lp->lp_pid_start = p->p_start;
 474                 /* insert pid in hash */
 475                 LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
 476                 lp->lp_valid = 1;
 477                 lp->lp_time = now.tv_sec;
 478                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
 479                 error = 0;
 480         }
 481
 482         if ((mlp && (lp != mlp)) || TAILQ_FIRST(&nfs_lock_pid_free)) {
 483                 lck_mtx_unlock(nfs_lock_mutex);
 484                 if (mlp && (lp != mlp)) {
 485                         /* we didn't need this one, so we can free it */
 486                         FREE(mlp, M_TEMP);
 487                 }
 488                 /* free up any stale entries */
 489                 while ((lp = TAILQ_FIRST(&nfs_lock_pid_free))) {
 490                         TAILQ_REMOVE(&nfs_lock_pid_free, lp, lp_lru);
 491                         FREE(lp, M_TEMP);
 492                 }
 493                 lck_mtx_lock(nfs_lock_mutex);
 494         }
 495
 496         return (error);
 497 }
 498
 499 #define MACH_MAX_TRIES 3
 500
 501 int
 502 nfs_lockd_send_request(LOCKD_MSG *msg, int interruptable)
 503 {
 504         kern_return_t kr;
 505         int retries = 0;
 506         mach_port_t lockd_port = IPC_PORT_NULL;
 507
 508         kr = host_get_lockd_port(host_priv_self(), &lockd_port);
 509         if (kr != KERN_SUCCESS || !IPC_PORT_VALID(lockd_port))
 510                 return (ENOTSUP);
 511
 512         do {
 513                 /* In the kernel all mach messaging is interruptable */
 514                 do {
 515                         kr = lockd_request(
 516                                 lockd_port,
 517                                 msg->lm_version,
 518                                 msg->lm_flags,
 519                                 msg->lm_xid,
 520                                 msg->lm_fl.l_start,
 521                                 msg->lm_fl.l_len,
 522                                 msg->lm_fl.l_pid,
 523                                 msg->lm_fl.l_type,
 524                                 msg->lm_fl.l_whence,
 525                                 (uint32_t *)&msg->lm_addr,
 526                                 (uint32_t *)&msg->lm_cred,
 527                                 msg->lm_fh_len,
 528                                 msg->lm_fh);
 529                         if (kr != KERN_SUCCESS)
 530                                 printf("lockd_request received %d!\n", kr);
 531                 } while (!interruptable && kr == MACH_SEND_INTERRUPTED);
 532         } while (kr == MIG_SERVER_DIED && retries++ < MACH_MAX_TRIES);
 533
 534         ipc_port_release_send(lockd_port);
 535         switch (kr) {
 536         case MACH_SEND_INTERRUPTED:
 537                 return (EINTR);
 538         default:
 539                 /*
 540                  * Other MACH or MIG errors we will retry. Eventually
 541                  * we will call nfs_down and allow the user to disable
 542                  * locking.
 543                  */
 544                 return (EAGAIN);
 545         }
 546         return (kr);
 547 }
 548
 549
 550 /*
 551  * NFS advisory byte-level locks (client)
 552  */
 553 int
 554 nfs3_vnop_advlock(
 555         struct vnop_advlock_args /* {
 556                 struct vnodeop_desc *a_desc;
 557                 vnode_t a_vp;
 558                 caddr_t a_id;
 559                 int a_op;
 560                 struct flock *a_fl;
 561                 int a_flags;
 562                 vfs_context_t a_context;
 563         } */ *ap)
 564 {
 565         vfs_context_t ctx;
 566         proc_t p;
 567         LOCKD_MSG_REQUEST msgreq;
 568         LOCKD_MSG *msg;
 569         vnode_t vp;
 570         nfsnode_t np;
 571         int error, error2;
 572         int interruptable, modified;
 573         struct flock *fl;
 574         struct nfsmount *nmp;
 575         struct nfs_vattr nvattr;
 576         off_t start, end;
 577         struct timeval now;
 578         int timeo, endtime, lastmsg, wentdown = 0;
 579         int lockpidcheck, nfsvers;
 580         struct sockaddr *saddr;
 581         struct timespec ts;
 582
 583         ctx = ap->a_context;
 584         p = vfs_context_proc(ctx);
 585         vp = ap->a_vp;
 586         fl = ap->a_fl;
 587         np = VTONFS(vp);
 588
 589         nmp = VTONMP(vp);
 590         if (!nmp)
 591                 return (ENXIO);
 592         lck_mtx_lock(&nmp->nm_lock);
 593         if (nmp->nm_flag & NFSMNT_NOLOCKS) {
 594                 lck_mtx_unlock(&nmp->nm_lock);
 595                 return (ENOTSUP);
 596         }
 597         nfsvers = nmp->nm_vers;
 598         lck_mtx_unlock(&nmp->nm_lock);
 599
 600         /*
 601          * The NLM protocol doesn't allow the server to return an error
 602          * on ranges, so we do it.  Pre LFS (Large File Summit)
 603          * standards required EINVAL for the range errors.  More recent
 604          * standards use EOVERFLOW, but their EINVAL wording still
 605          * encompasses these errors.
 606          * Any code sensitive to this is either:
 607          *  1) written pre-LFS and so can handle only EINVAL, or
 608          *  2) written post-LFS and thus ought to be tolerant of pre-LFS
 609          *     implementations.
 610          * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
 611          */
 612         if (fl->l_whence != SEEK_END) {
 613                 if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
 614                     fl->l_start < 0 ||
 615                     (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
 616                     (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
 617                         return (EINVAL);
 618         }
 619
 620         lck_mtx_lock(nfs_lock_mutex);
 621
 622         /*
 623          * Need to check if this process has successfully acquired an NFS lock before.
 624          * If not, and this is an unlock request we can simply return success here.
 625          */
 626         lockpidcheck = nfs_lock_pid_check(p, 0);
 627         lck_mtx_unlock(nfs_lock_mutex);
 628         if (lockpidcheck) {
 629                 if (lockpidcheck != ENOENT)
 630                         return (lockpidcheck);
 631                 if ((ap->a_op == F_UNLCK) && nfs_lock_pid_hash_trusted)
 632                         return (0);
 633         }
 634
 635         /*
 636          * The NFS Lock Manager protocol doesn't directly handle
 637          * negative lengths or SEEK_END, so we need to normalize
 638          * things here where we have all the info.
 639          * (Note: SEEK_CUR is already adjusted for at this point)
 640          */
 641         /* Convert the flock structure into a start and end. */
 642         switch (fl->l_whence) {
 643         case SEEK_SET:
 644         case SEEK_CUR:
 645                 /*
 646                  * Caller is responsible for adding any necessary offset
 647                  * to fl->l_start when SEEK_CUR is used.
 648                  */
 649                 start = fl->l_start;
 650                 break;
 651         case SEEK_END:
 652                 /* need to flush, and refetch attributes to make */
 653                 /* sure we have the correct end of file offset   */
 654                 if ((error = nfs_node_lock(np)))
 655                         return (error);
 656                 modified = (np->n_flag & NMODIFIED);
 657                 nfs_node_unlock(np);
 658                 if (modified && ((error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1))))
 659                         return (error);
 660                 if ((error = nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED)))
 661                         return (error);
 662                 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
 663                 start = np->n_size + fl->l_start;
 664                 nfs_data_unlock(np);
 665                 break;
 666         default:
 667                 return (EINVAL);
 668         }
 669         if (fl->l_len == 0)
 670                 end = -1;
 671         else if (fl->l_len > 0)
 672                 end = start + fl->l_len - 1;
 673         else { /* l_len is negative */
 674                 end = start - 1;
 675                 start += fl->l_len;
 676         }
 677         if (start < 0)
 678                 return (EINVAL);
 679
 680         if ((nfsvers == NFS_VER2) &&
 681             ((start >= 0x80000000) || (end >= 0x80000000)))
 682                 return (EINVAL);
 683
 684         /*
 685          * Fill in the information structure.
 686          * We set all values to zero with bzero to clear
 687          * out any information in the sockaddr_storage
 688          * and nfs_filehandle contained in msgreq so that
 689          * we will not leak extraneous information out of
 690          * the kernel when calling up to lockd via our mig
 691          * generated routine.
 692          */
 693         bzero(&msgreq, sizeof(msgreq));
 694         msg = &msgreq.lmr_msg;
 695         msg->lm_version = LOCKD_MSG_VERSION;
 696         msg->lm_flags = 0;
 697
 698         msg->lm_fl = *fl;
 699         msg->lm_fl.l_start = start;
 700         if (end != -1)
 701                 msg->lm_fl.l_len = end - start + 1;
 702         msg->lm_fl.l_pid = vfs_context_pid(ctx);
 703
 704         if (ap->a_flags & F_WAIT)
 705                 msg->lm_flags |= LOCKD_MSG_BLOCK;
 706         if (ap->a_op == F_GETLK)
 707                 msg->lm_flags |= LOCKD_MSG_TEST;
 708
 709         nmp = VTONMP(vp);
 710         if (!nmp)
 711                 return (ENXIO);
 712
 713         lck_mtx_lock(&nmp->nm_lock);
 714         saddr = mbuf_data(nmp->nm_nam);
 715         bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len));
 716         msg->lm_fh_len = (nfsvers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize;
 717         bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len);
 718         if (nfsvers == NFS_VER3)
 719                 msg->lm_flags |= LOCKD_MSG_NFSV3;
 720         cru2x(vfs_context_ucred(ctx), &msg->lm_cred);
 721
 722         microuptime(&now);
 723         lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
 724         interruptable = nmp->nm_flag & NFSMNT_INT;
 725         lck_mtx_unlock(&nmp->nm_lock);
 726
 727         lck_mtx_lock(nfs_lock_mutex);
 728
 729         /* allocate unique xid */
 730         msg->lm_xid = nfs_lockxid_get();
 731         nfs_lockdmsg_enqueue(&msgreq);
 732
 733         timeo = 2;
 734
 735         for (;;) {
 736                 nfs_lockd_request_sent = 1;
 737
 738                 /* need to drop nfs_lock_mutex while calling nfs_lockd_send_request() */
 739                 lck_mtx_unlock(nfs_lock_mutex);
 740                 error = nfs_lockd_send_request(msg, interruptable);
 741                 lck_mtx_lock(nfs_lock_mutex);
 742                 if (error && error != EAGAIN)
 743                         break;
 744
 745                 /*
 746                  * Always wait for an answer.  Not waiting for unlocks could
 747                  * cause a lock to be left if the unlock request gets dropped.
 748                  */
 749
 750                 /*
 751                  * Retry if it takes too long to get a response.
 752                  *
 753                  * The timeout numbers were picked out of thin air... they start
 754                  * at 2 and double each timeout with a max of 60 seconds.
 755                  *
 756                  * In order to maintain responsiveness, we pass a small timeout
 757                  * to msleep and calculate the timeouts ourselves.  This allows
 758                  * us to pick up on mount changes quicker.
 759                  */
 760 wait_for_granted:
 761                 error = EWOULDBLOCK;
 762                 ts.tv_sec = 2;
 763                 ts.tv_nsec = 0;
 764                 microuptime(&now);
 765                 endtime = now.tv_sec + timeo;
 766                 while (now.tv_sec < endtime) {
 767                         error = error2 = 0;
 768                         if (!msgreq.lmr_answered)
 769                                 error = msleep(&msgreq, nfs_lock_mutex, PCATCH | PUSER, "lockd", &ts);
 770                         if (msgreq.lmr_answered) {
 771                                 /*
 772                                  * Note: it's possible to have a lock granted at
 773                                  * essentially the same time that we get interrupted.
 774                                  * Since the lock may be granted, we can't return an
 775                                  * error from this request or we might not unlock the
 776                                  * lock that's been granted.
 777                                  */
 778                                 nmp = VTONMP(vp);
 779                                 if ((msgreq.lmr_errno == ENOTSUP) && nmp &&
 780                                     (nmp->nm_state & NFSSTA_LOCKSWORK)) {
 781                                         /*
 782                                          * We have evidence that locks work, yet lockd
 783                                          * returned ENOTSUP.  This is probably because
 784                                          * it was unable to contact the server's lockd
 785                                          * to send it the request.
 786                                          *
 787                                          * Because we know locks work, we'll consider
 788                                          * this failure to be a timeout.
 789                                          */
 790                                         error = EWOULDBLOCK;
 791                                 } else {
 792                                         error = 0;
 793                                 }
 794                                 break;
 795                         }
 796                         if (error != EWOULDBLOCK)
 797                                 break;
 798                         /* check that we still have our mount... */
 799                         /* ...and that we still support locks */
 800                         nmp = VTONMP(vp);
 801                         if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) {
 802                                 error = error2;
 803                                 if (fl->l_type == F_UNLCK)
 804                                         printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error);
 805                                 break;
 806                         }
 807                         lck_mtx_lock(&nmp->nm_lock);
 808                         if (nmp->nm_flag & NFSMNT_NOLOCKS) {
 809                                 lck_mtx_unlock(&nmp->nm_lock);
 810                                 break;
 811                         }
 812                         interruptable = nmp->nm_flag & NFSMNT_INT;
 813                         lck_mtx_unlock(&nmp->nm_lock);
 814                         microuptime(&now);
 815                 }
 816                 if (error) {
 817                         /* check that we still have our mount... */
 818                         nmp = VTONMP(vp);
 819                         if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) {
 820                                 error = error2;
 821                                 if (error2 != EINTR) {
 822                                         if (fl->l_type == F_UNLCK)
 823                                                 printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error);
 824                                         break;
 825                                 }
 826                         }
 827                         /* ...and that we still support locks */
 828                         lck_mtx_lock(&nmp->nm_lock);
 829                         if (nmp->nm_flag & NFSMNT_NOLOCKS) {
 830                                 if (error == EWOULDBLOCK)
 831                                         error = ENOTSUP;
 832                                 lck_mtx_unlock(&nmp->nm_lock);
 833                                 break;
 834                         }
 835                         interruptable = nmp->nm_flag & NFSMNT_INT;
 836                         if (error != EWOULDBLOCK) {
 837                                 lck_mtx_unlock(&nmp->nm_lock);
 838                                 /*
 839                                  * We're going to bail on this request.
 840                                  * If we were a blocked lock request, send a cancel.
 841                                  */
 842                                 if ((msgreq.lmr_errno == EINPROGRESS) &&
 843                                     !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
 844                                         /* set this request up as a cancel */
 845                                         msg->lm_flags |= LOCKD_MSG_CANCEL;
 846                                         nfs_lockdmsg_dequeue(&msgreq);
 847                                         msg->lm_xid = nfs_lockxid_get();
 848                                         nfs_lockdmsg_enqueue(&msgreq);
 849                                         msgreq.lmr_saved_errno = error;
 850                                         msgreq.lmr_errno = 0;
 851                                         msgreq.lmr_answered = 0;
 852                                         /* reset timeout */
 853                                         timeo = 2;
 854                                         /* send cancel request */
 855                                         continue;
 856                                 }
 857                                 break;
 858                         }
 859
 860                         /* warn if we're not getting any response */
 861                         microuptime(&now);
 862                         if ((msgreq.lmr_errno != EINPROGRESS) &&
 863                             !(msg->lm_flags & LOCKD_MSG_DENIED_GRACE) &&
 864                             (nmp->nm_tprintf_initial_delay != 0) &&
 865                             ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
 866                                 lck_mtx_unlock(&nmp->nm_lock);
 867                                 lastmsg = now.tv_sec;
 868                                 nfs_down(nmp, vfs_context_thread(ctx), 0, NFSSTA_LOCKTIMEO, "lockd not responding");
 869                                 wentdown = 1;
 870                         } else
 871                                 lck_mtx_unlock(&nmp->nm_lock);
 872
 873                         if (msgreq.lmr_errno == EINPROGRESS) {
 874                                 /*
 875                                  * We've got a blocked lock request that we are
 876                                  * going to retry.  First, we'll want to try to
 877                                  * send a cancel for the previous request.
 878                                  *
 879                                  * Clear errno so if we don't get a response
 880                                  * to the resend we'll call nfs_down().
 881                                  * Also reset timeout because we'll expect a
 882                                  * quick response to the cancel/resend (even if
 883                                  * it is NLM_BLOCKED).
 884                                  */
 885                                 msg->lm_flags |= LOCKD_MSG_CANCEL;
 886                                 nfs_lockdmsg_dequeue(&msgreq);
 887                                 msg->lm_xid = nfs_lockxid_get();
 888                                 nfs_lockdmsg_enqueue(&msgreq);
 889                                 msgreq.lmr_saved_errno = msgreq.lmr_errno;
 890                                 msgreq.lmr_errno = 0;
 891                                 msgreq.lmr_answered = 0;
 892                                 timeo = 2;
 893                                 /* send cancel then resend request */
 894                                 continue;
 895                         }
 896
 897                         if (msg->lm_flags & LOCKD_MSG_DENIED_GRACE) {
 898                                 /*
 899                                  * Time to resend a request previously denied due to a grace period.
 900                                  */
 901                                 msg->lm_flags &= ~LOCKD_MSG_DENIED_GRACE;
 902                                 nfs_lockdmsg_dequeue(&msgreq);
 903                                 msg->lm_xid = nfs_lockxid_get();
 904                                 nfs_lockdmsg_enqueue(&msgreq);
 905                                 msgreq.lmr_saved_errno = 0;
 906                                 msgreq.lmr_errno = 0;
 907                                 msgreq.lmr_answered = 0;
 908                                 timeo = 2;
 909                                 /* resend request */
 910                                 continue;
 911                         }
 912
 913                         /*
 914                          * We timed out, so we will resend the request.
 915                          */
 916                         timeo *= 2;
 917                         if (timeo > 60)
 918                                 timeo = 60;
 919                         /* resend request */
 920                         continue;
 921                 }
 922
 923                 /* we got a response, so the server's lockd is OK */
 924                 nfs_up(VTONMP(vp), vfs_context_thread(ctx), NFSSTA_LOCKTIMEO,
 925                         wentdown ? "lockd alive again" : NULL);
 926                 wentdown = 0;
 927
 928                 if (msgreq.lmr_answered && (msg->lm_flags & LOCKD_MSG_DENIED_GRACE)) {
 929                         /*
 930                          * The lock request was denied because the server lockd is
 931                          * still in its grace period.  So, we need to try the
 932                          * request again in a little bit.
 933                          */
 934                         timeo = 4;
 935                         msgreq.lmr_answered = 0;
 936                         goto wait_for_granted;
 937                 }
 938
 939                 if (msgreq.lmr_errno == EINPROGRESS) {
 940                         /* got NLM_BLOCKED response */
 941                         /* need to wait for NLM_GRANTED */
 942                         timeo = 60;
 943                         msgreq.lmr_answered = 0;
 944                         goto wait_for_granted;
 945                 }
 946
 947                 if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
 948                     (msgreq.lmr_saved_errno == EINPROGRESS)) {
 949                         /*
 950                          * We just got a successful reply to the
 951                          * cancel of the previous blocked lock request.
 952                          * Now, go ahead and resend the request.
 953                          */
 954                         msg->lm_flags &= ~LOCKD_MSG_CANCEL;
 955                         nfs_lockdmsg_dequeue(&msgreq);
 956                         msg->lm_xid = nfs_lockxid_get();
 957                         nfs_lockdmsg_enqueue(&msgreq);
 958                         msgreq.lmr_saved_errno = 0;
 959                         msgreq.lmr_errno = 0;
 960                         msgreq.lmr_answered = 0;
 961                         timeo = 2;
 962                         /* resend request */
 963                         continue;
 964                 }
 965
 966                 if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
 967                         if (msg->lm_fl.l_type != F_UNLCK) {
 968                                 fl->l_type = msg->lm_fl.l_type;
 969                                 fl->l_pid = msg->lm_fl.l_pid;
 970                                 fl->l_start = msg->lm_fl.l_start;
 971                                 fl->l_len = msg->lm_fl.l_len;
 972                                 fl->l_whence = SEEK_SET;
 973                         } else
 974                                 fl->l_type = F_UNLCK;
 975                 }
 976
 977                 /*
 978                  * If the blocked lock request was cancelled.
 979                  * Restore the error condition from when we
 980                  * originally bailed on the request.
 981                  */
 982                 if (msg->lm_flags & LOCKD_MSG_CANCEL) {
 983                         msg->lm_flags &= ~LOCKD_MSG_CANCEL;
 984                         error = msgreq.lmr_saved_errno;
 985                 } else
 986                         error = msgreq.lmr_errno;
 987
 988                 nmp = VTONMP(vp);
 989                 if ((error == ENOTSUP) && nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK)) {
 990                         /*
 991                          * We have NO evidence that locks work and lockd
 992                          * returned ENOTSUP.  Let's take this as a hint
 993                          * that locks aren't supported and disable them
 994                          * for this mount.
 995                          */
 996                         lck_mtx_lock(&nmp->nm_lock);
 997                         nmp->nm_flag |= NFSMNT_NOLOCKS;
 998                         nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
 999                         lck_mtx_unlock(&nmp->nm_lock);
1000                         printf("lockd returned ENOTSUP, disabling locks for nfs server: %s\n",
1001                                 vfs_statfs(nmp->nm_mountp)->f_mntfromname);
1002                 }
1003                 if (!error) {
1004                         /* record that NFS file locking has worked on this mount */
1005                         if (nmp) {
1006                                 lck_mtx_lock(&nmp->nm_lock);
1007                                 if (!(nmp->nm_state & NFSSTA_LOCKSWORK))
1008                                         nmp->nm_state |= NFSSTA_LOCKSWORK;
1009                                 lck_mtx_unlock(&nmp->nm_lock);
1010                         }
1011                         /*
1012                          * If we successfully acquired a lock, make sure this pid
1013                          * is in the nfs_lock_pid hash table so we know we can't
1014                          * short-circuit unlock requests.
1015                          */
1016                         if ((lockpidcheck == ENOENT) &&
1017                             ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW))) {
1018                                 error = nfs_lock_pid_check(p, 1);
1019                                 if (error) {
1020                                         /*
1021                                          * We couldn't add the pid to the table,
1022                                          * so we can no longer trust that a pid
1023                                          * not in the table has no locks.
1024                                          */
1025                                         nfs_lock_pid_hash_trusted = 0;
1026                                         printf("nfs_vnop_advlock: pid add failed - no longer trusted\n");
1027                                 }
1028                         }
1029                 }
1030                 break;
1031         }
1032
1033         nfs_lockdmsg_dequeue(&msgreq);
1034
1035         lck_mtx_unlock(nfs_lock_mutex);
1036
1037         return (error);
1038 }
1039
1040 /*
1041  * nfslockdans --
1042  *      NFS advisory byte-level locks answer from the lock daemon.
1043  */
1044 int
1045 nfslockdans(proc_t p, struct lockd_ans *ansp)
1046 {
1047         LOCKD_MSG_REQUEST *msgreq;
1048         int error;
1049
1050         /* Let root make this call. */
1051         error = proc_suser(p);
1052         if (error)
1053                 return (error);
1054
1055         /* the version should match, or we're out of sync */
1056         if (ansp->la_version != LOCKD_ANS_VERSION)
1057                 return (EINVAL);
1058
1059         lck_mtx_lock(nfs_lock_mutex);
1060
1061         /* try to find the lockd message by transaction id (cookie) */
1062         msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
1063         if (ansp->la_flags & LOCKD_ANS_GRANTED) {
1064                 /*
1065                  * We can't depend on the granted message having our cookie,
1066                  * so we check the answer against the lockd message found.
1067                  * If no message was found or it doesn't match the answer,
1068                  * we look for the lockd message by the answer's lock info.
1069                  */
1070                 if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
1071                         msgreq = nfs_lockdmsg_find_by_answer(ansp);
1072                 /*
1073                  * We need to make sure this request isn't being cancelled
1074                  * If it is, we don't want to accept the granted message.
1075                  */
1076                 if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
1077                         msgreq = NULL;
1078         }
1079         if (!msgreq) {
1080                 lck_mtx_unlock(nfs_lock_mutex);
1081                 return (EPIPE);
1082         }
1083
1084         msgreq->lmr_errno = ansp->la_errno;
1085         if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
1086                 if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
1087                         if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
1088                                 msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
1089                         else
1090                                 msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
1091                         msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
1092                         msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
1093                         msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
1094                 } else {
1095                         msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
1096                 }
1097         }
1098         if (ansp->la_flags & LOCKD_ANS_DENIED_GRACE)
1099                 msgreq->lmr_msg.lm_flags |= LOCKD_MSG_DENIED_GRACE;
1100
1101         msgreq->lmr_answered = 1;
1102         lck_mtx_unlock(nfs_lock_mutex);
1103         wakeup(msgreq);
1104
1105         return (0);
1106 }
1107