bsd/nfs/nfs_lock.c

   1 /*
   2  * Copyright (c) 2002-2007 Apple Inc.  All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*-
  29  * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
  30  *
  31  * Redistribution and use in source and binary forms, with or without
  32  * modification, are permitted provided that the following conditions
  33  * are met:
  34  * 1. Redistributions of source code must retain the above copyright
  35  *    notice, this list of conditions and the following disclaimer.
  36  * 2. Redistributions in binary form must reproduce the above copyright
  37  *    notice, this list of conditions and the following disclaimer in the
  38  *    documentation and/or other materials provided with the distribution.
  39  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  40  *    promote products derived from this software without specific prior
  41  *    written permission.
  42  *
  43  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  44  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  45  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  46  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  47  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  48  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  49  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  50  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  51  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  52  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  53  * SUCH DAMAGE.
  54  *
  55  *      from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
  56  */
  57
  58 #include <sys/cdefs.h>
  59 #include <sys/param.h>
  60 #include <sys/systm.h>
  61 #include <sys/fcntl.h>
  62 #include <sys/kernel.h>         /* for hz */
  63 #include <sys/file_internal.h>
  64 #include <sys/malloc.h>
  65 #include <sys/lockf.h>          /* for hz */ /* Must come after sys/malloc.h */
  66 #include <sys/kpi_mbuf.h>
  67 #include <sys/mount_internal.h>
  68 #include <sys/proc_internal.h>  /* for p_start */
  69 #include <sys/kauth.h>
  70 #include <sys/resourcevar.h>
  71 #include <sys/socket.h>
  72 #include <sys/unistd.h>
  73 #include <sys/user.h>
  74 #include <sys/vnode_internal.h>
  75
  76 #include <kern/thread.h>
  77 #include <kern/host.h>
  78
  79 #include <machine/limits.h>
  80
  81 #include <net/if.h>
  82
  83 #include <nfs/rpcv2.h>
  84 #include <nfs/nfsproto.h>
  85 #include <nfs/nfs.h>
  86 #include <nfs/nfs_gss.h>
  87 #include <nfs/nfsmount.h>
  88 #include <nfs/nfsnode.h>
  89 #include <nfs/nfs_lock.h>
  90
  91 #include <mach/host_priv.h>
  92 #include <mach/mig_errors.h>
  93 #include <mach/host_special_ports.h>
  94 #include <lockd/lockd_mach.h>
  95
  96 extern void ipc_port_release_send(ipc_port_t);
  97
  98 #define OFF_MAX QUAD_MAX
  99
 100 /*
 101  * pending lock request messages are kept in this queue which is
 102  * kept sorted by transaction ID (xid).
 103  */
 104 static uint64_t nfs_lockxid = 0;
 105 static LOCKD_MSG_QUEUE nfs_pendlockq;
 106
 107 /*
 108  * This structure is used to identify processes which have acquired NFS locks.
 109  * Knowing which processes have ever acquired locks allows us to short-circuit
 110  * unlock requests for processes that have never had an NFS file lock.  Thus
 111  * avoiding a costly and unnecessary lockd request.
 112  */
 113 struct nfs_lock_pid {
 114         TAILQ_ENTRY(nfs_lock_pid)       lp_lru;         /* LRU list */
 115         LIST_ENTRY(nfs_lock_pid)        lp_hash;        /* hash chain */
 116         int                             lp_valid;       /* valid entry? */
 117         int                             lp_time;        /* last time seen valid */
 118         pid_t                           lp_pid;         /* The process ID. */
 119         struct timeval                  lp_pid_start;   /* Start time of process id */
 120 };
 121
 122 #define NFS_LOCK_PID_HASH_SIZE          64      // XXX tune me
 123 #define NFS_LOCK_PID_HASH(pid)  \
 124         (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
 125 static LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
 126 static TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
 127 static u_long nfs_lock_pid_hash, nfs_lock_pid_hash_trusted;
 128
 129 static lck_grp_t *nfs_lock_lck_grp;
 130 static lck_mtx_t *nfs_lock_mutex;
 131
 132
 133 /*
 134  * initialize global nfs lock state
 135  */
 136 void
 137 nfs_lockinit(void)
 138 {
 139         TAILQ_INIT(&nfs_pendlockq);
 140         nfs_lock_pid_hash_trusted = 1;
 141         nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
 142                                          M_TEMP, &nfs_lock_pid_hash);
 143         TAILQ_INIT(&nfs_lock_pid_lru);
 144
 145         nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL);
 146         nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL);
 147 }
 148
 149 /*
 150  * change the count of NFS mounts that may need to make lockd requests
 151  *
 152  * If the mount count drops to zero, then send a shutdown request to
 153  * lockd if we've sent any requests to it.
 154  */
 155 void
 156 nfs_lockd_mount_change(int i)
 157 {
 158         mach_port_t lockd_port = IPC_PORT_NULL;
 159         kern_return_t kr;
 160         int send_shutdown;
 161
 162         lck_mtx_lock(nfs_lock_mutex);
 163
 164         nfs_lockd_mounts += i;
 165
 166         /* send a shutdown request if there are no more lockd mounts */
 167         send_shutdown = ((nfs_lockd_mounts == 0) && nfs_lockd_request_sent);
 168         if (send_shutdown)
 169                 nfs_lockd_request_sent = 0;
 170
 171         lck_mtx_unlock(nfs_lock_mutex);
 172
 173         if (!send_shutdown)
 174                 return;
 175
 176         /*
 177          * Let lockd know that it is no longer need for any NFS mounts
 178          */
 179         kr = host_get_lockd_port(host_priv_self(), &lockd_port);
 180         if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(lockd_port)) {
 181                 printf("nfs_lockd_mount_change: shutdown couldn't get port, kr %d, port %s\n",
 182                         kr, (lockd_port == IPC_PORT_NULL) ? "NULL" :
 183                         (lockd_port == IPC_PORT_DEAD) ? "DEAD" : "VALID");
 184                 return;
 185         }
 186
 187         kr = lockd_shutdown(lockd_port);
 188         if (kr != KERN_SUCCESS)
 189                 printf("nfs_lockd_mount_change: shutdown %d\n", kr);
 190
 191         ipc_port_release_send(lockd_port);
 192 }
 193
 194 /*
 195  * insert a lock request message into the pending queue
 196  * (nfs_lock_mutex must be held)
 197  */
 198 static inline void
 199 nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
 200 {
 201         LOCKD_MSG_REQUEST *mr;
 202
 203         mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
 204         if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
 205                 /* fast path: empty queue or new largest xid */
 206                 TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
 207                 return;
 208         }
 209         /* slow path: need to walk list to find insertion point */
 210         while (mr && (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
 211                 mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
 212         }
 213         if (mr) {
 214                 TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
 215         } else {
 216                 TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
 217         }
 218 }
 219
 220 /*
 221  * remove a lock request message from the pending queue
 222  * (nfs_lock_mutex must be held)
 223  */
 224 static inline void
 225 nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
 226 {
 227         TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
 228 }
 229
 230 /*
 231  * find a pending lock request message by xid
 232  *
 233  * We search from the head of the list assuming that the message we're
 234  * looking for is for an older request (because we have an answer to it).
 235  * This assumes that lock request will be answered primarily in FIFO order.
 236  * However, this may not be the case if there are blocked requests.  We may
 237  * want to move blocked requests to a separate queue (but that'll complicate
 238  * duplicate xid checking).
 239  *
 240  * (nfs_lock_mutex must be held)
 241  */
 242 static inline LOCKD_MSG_REQUEST *
 243 nfs_lockdmsg_find_by_xid(uint64_t lockxid)
 244 {
 245         LOCKD_MSG_REQUEST *mr;
 246
 247         TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
 248                 if (mr->lmr_msg.lm_xid == lockxid)
 249                         return mr;
 250                 if (mr->lmr_msg.lm_xid > lockxid)
 251                         return NULL;
 252         }
 253         return mr;
 254 }
 255
/*
 * Because we can't depend on nlm_granted messages containing the same
 * cookie we sent with the original lock request, we need code to test
 * whether an nlm_granted answer matches a lock request.  We also need
 * code that can find a lockd message based solely on the nlm_granted
 * answer.
 */
 262
 263 /*
 264  * compare lockd message to answer
 265  *
 266  * returns 0 on equality and 1 if different
 267  */
 268 static inline int
 269 nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
 270 {
 271         if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
 272                 return 1;
 273         if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
 274                 return 1;
 275         if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
 276                 return 1;
 277         if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
 278                 return 1;
 279         if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
 280                 return 1;
 281         if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
 282                 return 1;
 283         return 0;
 284 }
 285
 286 /*
 287  * find a pending lock request message based on the lock info provided
 288  * in the lockd_ans/nlm_granted data.  We need this because we can't
 289  * depend on nlm_granted messages containing the same cookie we sent
 290  * with the original lock request.
 291  *
 292  * We search from the head of the list assuming that the message we're
 293  * looking for is for an older request (because we have an answer to it).
 294  * This assumes that lock request will be answered primarily in FIFO order.
 295  * However, this may not be the case if there are blocked requests.  We may
 296  * want to move blocked requests to a separate queue (but that'll complicate
 297  * duplicate xid checking).
 298  *
 299  * (nfs_lock_mutex must be held)
 300  */
 301 static inline LOCKD_MSG_REQUEST *
 302 nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
 303 {
 304         LOCKD_MSG_REQUEST *mr;
 305
 306         if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
 307                 return NULL;
 308         TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
 309                 if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
 310                         break;
 311         }
 312         return mr;
 313 }
 314
 315 /*
 316  * return the next unique lock request transaction ID
 317  * (nfs_lock_mutex must be held)
 318  */
 319 static inline uint64_t
 320 nfs_lockxid_get(void)
 321 {
 322         LOCKD_MSG_REQUEST *mr;
 323
 324         /* derive initial lock xid from system time */
 325         if (!nfs_lockxid) {
 326                 /*
 327                  * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
 328                  * due to a broken clock) because we immediately increment it
 329                  * and we guarantee to never use xid 0.  So, nfs_lockxid should only
 330                  * ever be 0 the first time this function is called.
 331                  */
 332                 struct timeval tv;
 333                 microtime(&tv);
 334                 nfs_lockxid = (uint64_t)tv.tv_sec << 12;
 335         }
 336
 337         /* make sure we get a unique xid */
 338         do {
 339                 /* Skip zero xid if it should ever happen.  */
 340                 if (++nfs_lockxid == 0)
 341                         nfs_lockxid++;
 342                 if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
 343                      (mr->lmr_msg.lm_xid < nfs_lockxid)) {
 344                         /* fast path: empty queue or new largest xid */
 345                         break;
 346                 }
 347                 /* check if xid is already in use */
 348         } while (nfs_lockdmsg_find_by_xid(nfs_lockxid));
 349
 350         return nfs_lockxid;
 351 }
 352
 353
 354 /*
 355  * Check the nfs_lock_pid hash table for an entry and, if requested,
 356  * add the entry if it is not found.
 357  *
 358  * (Also, if adding, try to clean up some stale entries.)
 359  * (nfs_lock_mutex must be held)
 360  */
 361 static int
 362 nfs_lock_pid_check(proc_t p, int addflag)
 363 {
 364         struct nfs_lock_pid *lp, *lplru, *lplru_next, *mlp;
 365         TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_free;
 366         proc_t plru = PROC_NULL;
 367         pid_t pid;
 368         int error = 0;
 369         struct timeval now;
 370
 371         TAILQ_INIT(&nfs_lock_pid_free);
 372         mlp = NULL;
 373
 374 loop:
 375         /* Search hash chain */
 376         pid = proc_pid(p);
 377         error = ENOENT;
 378         lp = NFS_LOCK_PID_HASH(pid)->lh_first;
 379         for (; lp != NULL; lp = lp->lp_hash.le_next)
 380                 if (lp->lp_pid == pid) {
 381                         /* found pid... */
 382                         if (timevalcmp(&lp->lp_pid_start, &p->p_start, ==)) {
 383                                 /* ...and it's valid */
 384                                 /* move to tail of LRU */
 385                                 TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
 386                                 microuptime(&now);
 387                                 lp->lp_time = now.tv_sec;
 388                                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
 389                                 error = 0;
 390                                 break;
 391                         }
 392                         /* ...but it's no longer valid */
 393                         /* remove from hash, invalidate, and move to lru head */
 394                         LIST_REMOVE(lp, lp_hash);
 395                         lp->lp_valid = 0;
 396                         TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
 397                         TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
 398                         lp = NULL;
 399                         break;
 400                 }
 401
 402         /* if we didn't find it (valid), use any newly allocated one */
 403         if (!lp)
 404                 lp = mlp;
 405
 406         /* if we don't have an lp and we've been asked to add it */
 407         if ((error == ENOENT) && addflag && !lp) {
 408                 /* scan lru list for invalid, stale entries to reuse/free */
 409                 int lrucnt = 0;
 410                 microuptime(&now);
 411                 for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
 412                         lplru_next = TAILQ_NEXT(lplru, lp_lru);
 413                         if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
 414                                 /*
 415                                  * If the oldest LRU entry is relatively new, then don't
 416                                  * bother scanning any further.
 417                                  */
 418                                 break;
 419                         }
 420                         /* remove entry from LRU, and check if it's still in use */
 421                         TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
 422                         if (!lplru->lp_valid || !(plru = proc_find(lplru->lp_pid)) ||
 423                             timevalcmp(&lplru->lp_pid_start, &plru->p_start, !=)) {
 424                                 if (plru != PROC_NULL) {
 425                                         proc_rele(plru);
 426                                         plru = PROC_NULL;
 427                                 }
 428                                 /* no longer in use */
 429                                 LIST_REMOVE(lplru, lp_hash);
 430                                 if (!lp) {
 431                                         /* we'll reuse this one */
 432                                         lp = lplru;
 433                                 } else {
 434                                         /* queue it up for freeing */
 435                                         TAILQ_INSERT_HEAD(&nfs_lock_pid_free, lplru, lp_lru);
 436                                 }
 437                         } else {
 438                                 /* still in use */
 439                                 if (plru != PROC_NULL) {
 440                                         proc_rele(plru);
 441                                         plru = PROC_NULL;
 442                                 }
 443                                 lplru->lp_time = now.tv_sec;
 444                                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
 445                         }
 446                         /* don't check too many entries at once */
 447                         if (++lrucnt > 8)
 448                                 break;
 449                 }
 450                 if (!lp) {
 451                         /* we need to allocate a new one */
 452                         lck_mtx_unlock(nfs_lock_mutex);
 453                         MALLOC(mlp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
 454                                 M_TEMP, M_WAITOK | M_ZERO);
 455                         lck_mtx_lock(nfs_lock_mutex);
 456                         if (mlp) /* make sure somebody hasn't already added this guy */
 457                                 goto loop;
 458                         error = ENOMEM;
 459                 }
 460         }
 461         if ((error == ENOENT) && addflag && lp) {
 462                 /* (re)initialize nfs_lock_pid info */
 463                 lp->lp_pid = pid;
 464                 lp->lp_pid_start = p->p_start;
 465                 /* insert pid in hash */
 466                 LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
 467                 lp->lp_valid = 1;
 468                 lp->lp_time = now.tv_sec;
 469                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
 470                 error = 0;
 471         }
 472
 473         if ((mlp && (lp != mlp)) || TAILQ_FIRST(&nfs_lock_pid_free)) {
 474                 lck_mtx_unlock(nfs_lock_mutex);
 475                 if (mlp && (lp != mlp)) {
 476                         /* we didn't need this one, so we can free it */
 477                         FREE(mlp, M_TEMP);
 478                 }
 479                 /* free up any stale entries */
 480                 while ((lp = TAILQ_FIRST(&nfs_lock_pid_free))) {
 481                         TAILQ_REMOVE(&nfs_lock_pid_free, lp, lp_lru);
 482                         FREE(lp, M_TEMP);
 483                 }
 484                 lck_mtx_lock(nfs_lock_mutex);
 485         }
 486
 487         return (error);
 488 }
 489
 490 #define MACH_MAX_TRIES 3
 491
 492 static int
 493 send_request(LOCKD_MSG *msg, int interruptable)
 494 {
 495         kern_return_t kr;
 496         int retries = 0;
 497         mach_port_t lockd_port = IPC_PORT_NULL;
 498
 499         kr = host_get_lockd_port(host_priv_self(), &lockd_port);
 500         if (kr != KERN_SUCCESS || !IPC_PORT_VALID(lockd_port))
 501                 return (ENOTSUP);
 502
 503         do {
 504                 /* In the kernel all mach messaging is interruptable */
 505                 do {
 506                         kr = lockd_request(
 507                                 lockd_port,
 508                                 msg->lm_version,
 509                                 msg->lm_flags,
 510                                 msg->lm_xid,
 511                                 msg->lm_fl.l_start,
 512                                 msg->lm_fl.l_len,
 513                                 msg->lm_fl.l_pid,
 514                                 msg->lm_fl.l_type,
 515                                 msg->lm_fl.l_whence,
 516                                 (uint32_t *)&msg->lm_addr,
 517                                 (uint32_t *)&msg->lm_cred,
 518                                 msg->lm_fh_len,
 519                                 msg->lm_fh);
 520                         if (kr != KERN_SUCCESS)
 521                                 printf("lockd_request received %d!\n", kr);
 522                 } while (!interruptable && kr == MACH_SEND_INTERRUPTED);
 523         } while (kr == MIG_SERVER_DIED && retries++ < MACH_MAX_TRIES);
 524
 525         ipc_port_release_send(lockd_port);
 526         switch (kr) {
 527         case MACH_SEND_INTERRUPTED:
 528                 return (EINTR);
 529         default:
 530                 /*
 531                  * Other MACH or MIG errors we will retry. Eventually
 532                  * we will call nfs_down and allow the user to disable
 533                  * locking.
 534                  */
 535                 return (EAGAIN);
 536         }
 537         return (kr);
 538 }
 539
 540
 541 /*
 542  * NFS advisory byte-level locks (client)
 543  */
 544 int
 545 nfs3_vnop_advlock(
 546         struct vnop_advlock_args /* {
 547                 struct vnodeop_desc *a_desc;
 548                 vnode_t a_vp;
 549                 caddr_t a_id;
 550                 int a_op;
 551                 struct flock *a_fl;
 552                 int a_flags;
 553                 vfs_context_t a_context;
 554         } */ *ap)
 555 {
 556         vfs_context_t ctx;
 557         proc_t p;
 558         LOCKD_MSG_REQUEST msgreq;
 559         LOCKD_MSG *msg;
 560         vnode_t vp;
 561         nfsnode_t np;
 562         int error, error2;
 563         int interruptable;
 564         struct flock *fl;
 565         struct nfsmount *nmp;
 566         struct nfs_vattr nvattr;
 567         off_t start, end;
 568         struct timeval now;
 569         int timeo, endtime, lastmsg, wentdown = 0;
 570         int lockpidcheck, nfsvers;
 571         struct sockaddr *saddr;
 572         struct timespec ts;
 573
 574         ctx = ap->a_context;
 575         p = vfs_context_proc(ctx);
 576         vp = ap->a_vp;
 577         fl = ap->a_fl;
 578         np = VTONFS(vp);
 579
 580         nmp = VTONMP(vp);
 581         if (!nmp)
 582                 return (ENXIO);
 583         lck_mtx_lock(&nmp->nm_lock);
 584         if (nmp->nm_flag & NFSMNT_NOLOCKS) {
 585                 lck_mtx_unlock(&nmp->nm_lock);
 586                 return (ENOTSUP);
 587         }
 588         nfsvers = nmp->nm_vers;
 589         lck_mtx_unlock(&nmp->nm_lock);
 590
 591         /*
 592          * The NLM protocol doesn't allow the server to return an error
 593          * on ranges, so we do it.  Pre LFS (Large File Summit)
 594          * standards required EINVAL for the range errors.  More recent
 595          * standards use EOVERFLOW, but their EINVAL wording still
 596          * encompasses these errors.
 597          * Any code sensitive to this is either:
 598          *  1) written pre-LFS and so can handle only EINVAL, or
 599          *  2) written post-LFS and thus ought to be tolerant of pre-LFS
 600          *     implementations.
 601          * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
 602          */
 603         if (fl->l_whence != SEEK_END) {
 604                 if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
 605                     fl->l_start < 0 ||
 606                     (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
 607                     (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
 608                         return (EINVAL);
 609         }
 610
 611         lck_mtx_lock(nfs_lock_mutex);
 612
 613         /*
 614          * Need to check if this process has successfully acquired an NFS lock before.
 615          * If not, and this is an unlock request we can simply return success here.
 616          */
 617         lockpidcheck = nfs_lock_pid_check(p, 0);
 618         lck_mtx_unlock(nfs_lock_mutex);
 619         if (lockpidcheck) {
 620                 if (lockpidcheck != ENOENT)
 621                         return (lockpidcheck);
 622                 if ((ap->a_op == F_UNLCK) && nfs_lock_pid_hash_trusted)
 623                         return (0);
 624         }
 625
 626         /*
 627          * The NFS Lock Manager protocol doesn't directly handle
 628          * negative lengths or SEEK_END, so we need to normalize
 629          * things here where we have all the info.
 630          * (Note: SEEK_CUR is already adjusted for at this point)
 631          */
 632         /* Convert the flock structure into a start and end. */
 633         switch (fl->l_whence) {
 634         case SEEK_SET:
 635         case SEEK_CUR:
 636                 /*
 637                  * Caller is responsible for adding any necessary offset
 638                  * to fl->l_start when SEEK_CUR is used.
 639                  */
 640                 start = fl->l_start;
 641                 break;
 642         case SEEK_END:
 643                 /* need to flush, and refetch attributes to make */
 644                 /* sure we have the correct end of file offset   */
 645                 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
 646                 if (error)
 647                         return (error);
 648                 NATTRINVALIDATE(np);
 649                 if (np->n_flag & NMODIFIED) {
 650                         nfs_unlock(np);
 651                         error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
 652                         if (error)
 653                                 return (error);
 654                 } else
 655                         nfs_unlock(np);
 656
 657                 error = nfs_getattr(np, &nvattr, ctx, 0);
 658                 nfs_data_lock(np, NFS_NODE_LOCK_SHARED);
 659                 if (!error)
 660                         error = nfs_lock(np, NFS_NODE_LOCK_SHARED);
 661                 if (error) {
 662                         nfs_data_unlock(np);
 663                         return (error);
 664                 }
 665                 start = np->n_size + fl->l_start;
 666                 nfs_unlock(np);
 667                 nfs_data_unlock(np);
 668                 break;
 669         default:
 670                 return (EINVAL);
 671         }
 672         if (fl->l_len == 0)
 673                 end = -1;
 674         else if (fl->l_len > 0)
 675                 end = start + fl->l_len - 1;
 676         else { /* l_len is negative */
 677                 end = start - 1;
 678                 start += fl->l_len;
 679         }
 680         if (start < 0)
 681                 return (EINVAL);
 682
 683         if ((nfsvers == NFS_VER2) &&
 684             ((start >= 0x80000000) || (end >= 0x80000000)))
 685                 return (EINVAL);
 686
 687         /*
 688          * Fill in the information structure.
 689          */
 690         msgreq.lmr_answered = 0;
 691         msgreq.lmr_errno = 0;
 692         msgreq.lmr_saved_errno = 0;
 693         msg = &msgreq.lmr_msg;
 694         msg->lm_version = LOCKD_MSG_VERSION;
 695         msg->lm_flags = 0;
 696
 697         msg->lm_fl = *fl;
 698         msg->lm_fl.l_start = start;
 699         if (end != -1)
 700                 msg->lm_fl.l_len = end - start + 1;
 701         msg->lm_fl.l_pid = vfs_context_pid(ctx);
 702
 703         if (ap->a_flags & F_WAIT)
 704                 msg->lm_flags |= LOCKD_MSG_BLOCK;
 705         if (ap->a_op == F_GETLK)
 706                 msg->lm_flags |= LOCKD_MSG_TEST;
 707
 708         nmp = VTONMP(vp);
 709         if (!nmp)
 710                 return (ENXIO);
 711
 712         lck_mtx_lock(&nmp->nm_lock);
 713         saddr = mbuf_data(nmp->nm_nam);
 714         bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len));
 715         msg->lm_fh_len = (nfsvers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize;
 716         bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len);
 717         if (nfsvers == NFS_VER3)
 718                 msg->lm_flags |= LOCKD_MSG_NFSV3;
 719         cru2x(vfs_context_ucred(ctx), &msg->lm_cred);
 720
 721         microuptime(&now);
 722         lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
 723         interruptable = nmp->nm_flag & NFSMNT_INT;
 724         lck_mtx_unlock(&nmp->nm_lock);
 725
 726         lck_mtx_lock(nfs_lock_mutex);
 727
 728         /* allocate unique xid */
 729         msg->lm_xid = nfs_lockxid_get();
 730         nfs_lockdmsg_enqueue(&msgreq);
 731
 732         timeo = 2;
 733
 734         for (;;) {
 735                 nfs_lockd_request_sent = 1;
 736
 737                 /* need to drop nfs_lock_mutex while calling send_request() */
 738                 lck_mtx_unlock(nfs_lock_mutex);
 739                 error = send_request(msg, interruptable);
 740                 lck_mtx_lock(nfs_lock_mutex);
 741                 if (error && error != EAGAIN)
 742                         break;
 743
 744                 /*
 745                  * Always wait for an answer.  Not waiting for unlocks could
 746                  * cause a lock to be left if the unlock request gets dropped.
 747                  */
 748
 749                 /*
 750                  * Retry if it takes too long to get a response.
 751                  *
 752                  * The timeout numbers were picked out of thin air... they start
 753                  * at 2 and double each timeout with a max of 60 seconds.
 754                  *
 755                  * In order to maintain responsiveness, we pass a small timeout
 756                  * to msleep and calculate the timeouts ourselves.  This allows
 757                  * us to pick up on mount changes quicker.
 758                  */
 759 wait_for_granted:
 760                 error = EWOULDBLOCK;
 761                 ts.tv_sec = 2;
 762                 ts.tv_nsec = 0;
 763                 microuptime(&now);
 764                 endtime = now.tv_sec + timeo;
 765                 while (now.tv_sec < endtime) {
 766                         error = error2 = 0;
 767                         if (!msgreq.lmr_answered)
 768                                 error = msleep(&msgreq, nfs_lock_mutex, PCATCH | PUSER, "lockd", &ts);
 769                         if (msgreq.lmr_answered) {
 770                                 /*
 771                                  * Note: it's possible to have a lock granted at
 772                                  * essentially the same time that we get interrupted.
 773                                  * Since the lock may be granted, we can't return an
 774                                  * error from this request or we might not unlock the
 775                                  * lock that's been granted.
 776                                  */
 777                                 nmp = VTONMP(vp);
 778                                 if ((msgreq.lmr_errno == ENOTSUP) && nmp &&
 779                                     (nmp->nm_state & NFSSTA_LOCKSWORK)) {
 780                                         /*
 781                                          * We have evidence that locks work, yet lockd
 782                                          * returned ENOTSUP.  This is probably because
 783                                          * it was unable to contact the server's lockd
 784                                          * to send it the request.
 785                                          *
 786                                          * Because we know locks work, we'll consider
 787                                          * this failure to be a timeout.
 788                                          */
 789                                         error = EWOULDBLOCK;
 790                                 } else {
 791                                         error = 0;
 792                                 }
 793                                 break;
 794                         }
 795                         if (error != EWOULDBLOCK)
 796                                 break;
 797                         /* check that we still have our mount... */
 798                         /* ...and that we still support locks */
 799                         nmp = VTONMP(vp);
 800                         if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) {
 801                                 error = error2;
 802                                 if (fl->l_type == F_UNLCK)
 803                                         printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error);
 804                                 break;
 805                         }
 806                         lck_mtx_lock(&nmp->nm_lock);
 807                         if (nmp->nm_flag & NFSMNT_NOLOCKS) {
 808                                 lck_mtx_unlock(&nmp->nm_lock);
 809                                 break;
 810                         }
 811                         interruptable = nmp->nm_flag & NFSMNT_INT;
 812                         lck_mtx_unlock(&nmp->nm_lock);
 813                         microuptime(&now);
 814                 }
 815                 if (error) {
 816                         /* check that we still have our mount... */
 817                         nmp = VTONMP(vp);
 818                         if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) {
 819                                 error = error2;
 820                                 if (error2 != EINTR) {
 821                                         if (fl->l_type == F_UNLCK)
 822                                                 printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error);
 823                                         break;
 824                                 }
 825                         }
 826                         /* ...and that we still support locks */
 827                         lck_mtx_lock(&nmp->nm_lock);
 828                         if (nmp->nm_flag & NFSMNT_NOLOCKS) {
 829                                 if (error == EWOULDBLOCK)
 830                                         error = ENOTSUP;
 831                                 lck_mtx_unlock(&nmp->nm_lock);
 832                                 break;
 833                         }
 834                         interruptable = nmp->nm_flag & NFSMNT_INT;
 835                         if (error != EWOULDBLOCK) {
 836                                 lck_mtx_unlock(&nmp->nm_lock);
 837                                 /*
 838                                  * We're going to bail on this request.
 839                                  * If we were a blocked lock request, send a cancel.
 840                                  */
 841                                 if ((msgreq.lmr_errno == EINPROGRESS) &&
 842                                     !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
 843                                         /* set this request up as a cancel */
 844                                         msg->lm_flags |= LOCKD_MSG_CANCEL;
 845                                         nfs_lockdmsg_dequeue(&msgreq);
 846                                         msg->lm_xid = nfs_lockxid_get();
 847                                         nfs_lockdmsg_enqueue(&msgreq);
 848                                         msgreq.lmr_saved_errno = error;
 849                                         msgreq.lmr_errno = 0;
 850                                         msgreq.lmr_answered = 0;
 851                                         /* reset timeout */
 852                                         timeo = 2;
 853                                         /* send cancel request */
 854                                         continue;
 855                                 }
 856                                 break;
 857                         }
 858
 859                         /* warn if we're not getting any response */
 860                         microuptime(&now);
 861                         if ((msgreq.lmr_errno != EINPROGRESS) &&
 862                             (nmp->nm_tprintf_initial_delay != 0) &&
 863                             ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
 864                                 lck_mtx_unlock(&nmp->nm_lock);
 865                                 lastmsg = now.tv_sec;
 866                                 nfs_down(nmp, vfs_context_thread(ctx), 0, NFSSTA_LOCKTIMEO, "lockd not responding");
 867                                 wentdown = 1;
 868                         } else
 869                                 lck_mtx_unlock(&nmp->nm_lock);
 870
 871                         if (msgreq.lmr_errno == EINPROGRESS) {
 872                                 /*
 873                                  * We've got a blocked lock request that we are
 874                                  * going to retry.  First, we'll want to try to
 875                                  * send a cancel for the previous request.
 876                                  *
 877                                  * Clear errno so if we don't get a response
 878                                  * to the resend we'll call nfs_down().
 879                                  * Also reset timeout because we'll expect a
 880                                  * quick response to the cancel/resend (even if
 881                                  * it is NLM_BLOCKED).
 882                                  */
 883                                 msg->lm_flags |= LOCKD_MSG_CANCEL;
 884                                 nfs_lockdmsg_dequeue(&msgreq);
 885                                 msg->lm_xid = nfs_lockxid_get();
 886                                 nfs_lockdmsg_enqueue(&msgreq);
 887                                 msgreq.lmr_saved_errno = msgreq.lmr_errno;
 888                                 msgreq.lmr_errno = 0;
 889                                 msgreq.lmr_answered = 0;
 890                                 timeo = 2;
 891                                 /* send cancel then resend request */
 892                                 continue;
 893                         }
 894                         /*
 895                          * We timed out, so we will resend the request.
 896                          */
 897                         timeo *= 2;
 898                         if (timeo > 60)
 899                                 timeo = 60;
 900                         /* resend request */
 901                         continue;
 902                 }
 903
 904                 /* we got a response, so the server's lockd is OK */
 905                 nfs_up(VTONMP(vp), vfs_context_thread(ctx), NFSSTA_LOCKTIMEO,
 906                         wentdown ? "lockd alive again" : NULL);
 907                 wentdown = 0;
 908
 909                 if (msgreq.lmr_errno == EINPROGRESS) {
 910                         /* got NLM_BLOCKED response */
 911                         /* need to wait for NLM_GRANTED */
 912                         timeo = 60;
 913                         msgreq.lmr_answered = 0;
 914                         goto wait_for_granted;
 915                 }
 916
 917                 if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
 918                     (msgreq.lmr_saved_errno == EINPROGRESS)) {
 919                         /*
 920                          * We just got a successful reply to the
 921                          * cancel of the previous blocked lock request.
 922                          * Now, go ahead and resend the request.
 923                          */
 924                         msg->lm_flags &= ~LOCKD_MSG_CANCEL;
 925                         nfs_lockdmsg_dequeue(&msgreq);
 926                         msg->lm_xid = nfs_lockxid_get();
 927                         nfs_lockdmsg_enqueue(&msgreq);
 928                         msgreq.lmr_saved_errno = 0;
 929                         msgreq.lmr_errno = 0;
 930                         msgreq.lmr_answered = 0;
 931                         timeo = 2;
 932                         /* resend request */
 933                         continue;
 934                 }
 935
 936                 if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
 937                         if (msg->lm_fl.l_type != F_UNLCK) {
 938                                 fl->l_type = msg->lm_fl.l_type;
 939                                 fl->l_pid = msg->lm_fl.l_pid;
 940                                 fl->l_start = msg->lm_fl.l_start;
 941                                 fl->l_len = msg->lm_fl.l_len;
 942                                 fl->l_whence = SEEK_SET;
 943                         } else
 944                                 fl->l_type = F_UNLCK;
 945                 }
 946
 947                 /*
 948                  * If the blocked lock request was cancelled.
 949                  * Restore the error condition from when we
 950                  * originally bailed on the request.
 951                  */
 952                 if (msg->lm_flags & LOCKD_MSG_CANCEL) {
 953                         msg->lm_flags &= ~LOCKD_MSG_CANCEL;
 954                         error = msgreq.lmr_saved_errno;
 955                 } else
 956                         error = msgreq.lmr_errno;
 957
 958                 nmp = VTONMP(vp);
 959                 if ((error == ENOTSUP) && nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK)) {
 960                         /*
 961                          * We have NO evidence that locks work and lockd
 962                          * returned ENOTSUP.  Let's take this as a hint
 963                          * that locks aren't supported and disable them
 964                          * for this mount.
 965                          */
 966                         lck_mtx_lock(&nmp->nm_lock);
 967                         nmp->nm_flag |= NFSMNT_NOLOCKS;
 968                         nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
 969                         lck_mtx_unlock(&nmp->nm_lock);
 970                         printf("lockd returned ENOTSUP, disabling locks for nfs server: %s\n",
 971                                 vfs_statfs(nmp->nm_mountp)->f_mntfromname);
 972                 }
 973                 if (!error) {
 974                         /* record that NFS file locking has worked on this mount */
 975                         if (nmp) {
 976                                 lck_mtx_lock(&nmp->nm_lock);
 977                                 if (!(nmp->nm_state & NFSSTA_LOCKSWORK))
 978                                         nmp->nm_state |= NFSSTA_LOCKSWORK;
 979                                 lck_mtx_unlock(&nmp->nm_lock);
 980                         }
 981                         /*
 982                          * If we successfully acquired a lock, make sure this pid
 983                          * is in the nfs_lock_pid hash table so we know we can't
 984                          * short-circuit unlock requests.
 985                          */
 986                         if ((lockpidcheck == ENOENT) &&
 987                             ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW))) {
 988                                 error = nfs_lock_pid_check(p, 1);
 989                                 if (error) {
 990                                         /*
 991                                          * We couldn't add the pid to the table,
 992                                          * so we can no longer trust that a pid
 993                                          * not in the table has no locks.
 994                                          */
 995                                         nfs_lock_pid_hash_trusted = 0;
 996                                         printf("nfs_vnop_advlock: pid add failed - no longer trusted\n");
 997                                 }
 998                         }
 999                 }
1000                 break;
1001         }
1002
1003         nfs_lockdmsg_dequeue(&msgreq);
1004
1005         lck_mtx_unlock(nfs_lock_mutex);
1006
1007         return (error);
1008 }
1009
1010 /*
1011  * nfslockdans --
1012  *      NFS advisory byte-level locks answer from the lock daemon.
1013  */
1014 int
1015 nfslockdans(proc_t p, struct lockd_ans *ansp)
1016 {
1017         LOCKD_MSG_REQUEST *msgreq;
1018         int error;
1019
1020         /* Let root make this call. */
1021         error = proc_suser(p);
1022         if (error)
1023                 return (error);
1024
1025         /* the version should match, or we're out of sync */
1026         if (ansp->la_version != LOCKD_ANS_VERSION)
1027                 return (EINVAL);
1028
1029         lck_mtx_lock(nfs_lock_mutex);
1030
1031         /* try to find the lockd message by transaction id (cookie) */
1032         msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
1033         if (ansp->la_flags & LOCKD_ANS_GRANTED) {
1034                 /*
1035                  * We can't depend on the granted message having our cookie,
1036                  * so we check the answer against the lockd message found.
1037                  * If no message was found or it doesn't match the answer,
1038                  * we look for the lockd message by the answer's lock info.
1039                  */
1040                 if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
1041                         msgreq = nfs_lockdmsg_find_by_answer(ansp);
1042                 /*
1043                  * We need to make sure this request isn't being cancelled
1044                  * If it is, we don't want to accept the granted message.
1045                  */
1046                 if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
1047                         msgreq = NULL;
1048         }
1049         if (!msgreq) {
1050                 lck_mtx_unlock(nfs_lock_mutex);
1051                 return (EPIPE);
1052         }
1053
1054         msgreq->lmr_errno = ansp->la_errno;
1055         if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
1056                 if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
1057                         if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
1058                                 msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
1059                         else
1060                                 msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
1061                         msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
1062                         msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
1063                         msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
1064                 } else {
1065                         msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
1066                 }
1067         }
1068
1069         msgreq->lmr_answered = 1;
1070         lck_mtx_unlock(nfs_lock_mutex);
1071         wakeup(msgreq);
1072
1073         return (0);
1074 }
1075