bsd/nfs/nfs_lock.c

   1 /*
   2  * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*-
  24  * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
  25  *
  26  * Redistribution and use in source and binary forms, with or without
  27  * modification, are permitted provided that the following conditions
  28  * are met:
  29  * 1. Redistributions of source code must retain the above copyright
  30  *    notice, this list of conditions and the following disclaimer.
  31  * 2. Redistributions in binary form must reproduce the above copyright
  32  *    notice, this list of conditions and the following disclaimer in the
  33  *    documentation and/or other materials provided with the distribution.
  34  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  35  *    promote products derived from this software without specific prior
  36  *    written permission.
  37  *
  38  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  41  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  48  * SUCH DAMAGE.
  49  *
  50  *      from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
  51  */
  52
  53 #include <sys/cdefs.h>
  54 #include <sys/param.h>
  55 #include <sys/systm.h>
  56 #include <sys/fcntl.h>
  57 #include <sys/kernel.h>         /* for hz */
  58 #include <sys/file_internal.h>
  59 #include <sys/malloc.h>
  60 #include <sys/lockf.h>          /* for hz */ /* Must come after sys/malloc.h */
  61 #include <sys/kpi_mbuf.h>
  62 #include <sys/mount_internal.h>
  63 #include <sys/proc_internal.h>  /* for p_start */
  64 #include <sys/kauth.h>
  65 #include <sys/resourcevar.h>
  66 #include <sys/socket.h>
  67 #include <sys/unistd.h>
  68 #include <sys/user.h>
  69 #include <sys/vnode_internal.h>
  70
  71 #include <kern/thread.h>
  72
  73 #include <machine/limits.h>
  74
  75 #include <net/if.h>
  76
  77 #include <nfs/rpcv2.h>
  78 #include <nfs/nfsproto.h>
  79 #include <nfs/nfs.h>
  80 #include <nfs/nfsmount.h>
  81 #include <nfs/nfsnode.h>
  82 #include <nfs/nfs_lock.h>
  83
  84 #define OFF_MAX QUAD_MAX
  85
   /*
    * globals for managing the lockd fifo
    */
   /* fifo vnode that lock requests are written to; 0 until the daemon has set it up */
   vnode_t nfslockdvnode = 0;
   /* non-zero: lock requests do a wakeup() on this variable's address to rouse the daemon */
   int nfslockdwaiting = 0;
   /* uptime (seconds) until which requests keep waiting for the daemon to start */
   time_t nfslockdstarttimeout = 0;
   /* flag word serializing writes to the fifo; see the NFSLOCKDFIFOLOCK_* bits */
   int nfslockdfifolock = 0;
   /* a writer currently holds the fifo */
   #define NFSLOCKDFIFOLOCK_LOCKED 1
   /* someone is sleeping on nfslockdfifolock and wants a wakeup on release */
   #define NFSLOCKDFIFOLOCK_WANT   2
  95
   /*
    * pending lock request messages are kept in this queue which is
    * kept sorted by transaction ID (xid).
    */
   /* most recently issued xid; 0 means "not yet seeded" and is never handed out */
   uint64_t nfs_lockxid = 0;
   /* queue of outstanding requests, ascending by lm_xid */
   LOCKD_MSG_QUEUE nfs_pendlockq;
 102
  /*
   * This structure is used to identify processes which have acquired NFS locks.
   * Knowing which processes have ever acquired locks allows us to short-circuit
   * unlock requests for processes that have never had an NFS file lock.  Thus
   * avoiding a costly and unnecessary lockd request.
   *
   * Each entry lives on the LRU list; a valid entry is additionally linked
   * into the pid hash.  An entry is only trusted if both the pid and the
   * process start time match, which guards against pid reuse.
   */
  struct nfs_lock_pid {
          TAILQ_ENTRY(nfs_lock_pid)       lp_lru;         /* LRU list */
          LIST_ENTRY(nfs_lock_pid)        lp_hash;        /* hash chain */
          int                             lp_valid;       /* valid entry? (0 once unhashed as stale) */
          int                             lp_time;        /* last time seen valid (uptime seconds) */
          pid_t                           lp_pid;         /* The process ID. */
          struct timeval                  lp_pid_start;   /* Start time of process id */
  };
 117
  /* number of buckets requested from hashinit() */
  #define NFS_LOCK_PID_HASH_SIZE          64      // XXX tune me
  /* yields a pointer to the hash bucket for the given pid */
  #define NFS_LOCK_PID_HASH(pid)  \
          (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
  /* bucket array, allocated in nfs_lockinit() */
  LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
  /* all entries, least recently validated first */
  TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
  /* hash mask filled in by hashinit() */
  u_long nfs_lock_pid_hash;
  /* sleep lock for the table: 0 = free, 1 = held, -1 = held with sleepers waiting */
  int nfs_lock_pid_lock;
 125
 126
 127 /*
 128  * initialize global nfs lock state
 129  */
 130 void
 131 nfs_lockinit(void)
 132 {
 133         TAILQ_INIT(&nfs_pendlockq);
 134         nfs_lock_pid_lock = 0;
 135         nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
 136                                          M_TEMP, &nfs_lock_pid_hash);
 137         TAILQ_INIT(&nfs_lock_pid_lru);
 138 }
 139
 140 /*
 141  * insert a lock request message into the pending queue
 142  */
 143 static inline void
 144 nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
 145 {
 146         LOCKD_MSG_REQUEST *mr;
 147
 148         mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
 149         if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
 150                 /* fast path: empty queue or new largest xid */
 151                 TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
 152                 return;
 153         }
 154         /* slow path: need to walk list to find insertion point */
 155         while (mr && (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
 156                 mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
 157         }
 158         if (mr) {
 159                 TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
 160         } else {
 161                 TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
 162         }
 163 }
 164
 165 /*
 166  * remove a lock request message from the pending queue
 167  */
 168 static inline void
 169 nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
 170 {
 171         TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
 172 }
 173
 174 /*
 175  * find a pending lock request message by xid
 176  *
 177  * We search from the head of the list assuming that the message we're
 178  * looking for is for an older request (because we have an answer to it).
 179  * This assumes that lock request will be answered primarily in FIFO order.
 180  * However, this may not be the case if there are blocked requests.  We may
 181  * want to move blocked requests to a separate queue (but that'll complicate
 182  * duplicate xid checking).
 183  */
 184 static inline LOCKD_MSG_REQUEST *
 185 nfs_lockdmsg_find_by_xid(uint64_t lockxid)
 186 {
 187         LOCKD_MSG_REQUEST *mr;
 188
 189         TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
 190                 if (mr->lmr_msg.lm_xid == lockxid)
 191                         return mr;
 192                 if (mr->lmr_msg.lm_xid > lockxid)
 193                         return NULL;
 194         }
 195         return mr;
 196 }
 197
  /*
   * Because we can't depend on nlm_granted messages containing the same
   * cookie we sent with the original lock request, we need code to test
   * whether an nlm_granted answer matches a lock request.  We also need
   * code that can find a lockd message based solely on the nlm_granted
   * answer.
   */
 204
 205 /*
 206  * compare lockd message to answer
 207  *
 208  * returns 0 on equality and 1 if different
 209  */
 210 static inline int
 211 nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
 212 {
 213         if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
 214                 return 1;
 215         if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
 216                 return 1;
 217         if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
 218                 return 1;
 219         if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
 220                 return 1;
 221         if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
 222                 return 1;
 223         if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
 224                 return 1;
 225         return 0;
 226 }
 227
 228 /*
 229  * find a pending lock request message based on the lock info provided
 230  * in the lockd_ans/nlm_granted data.  We need this because we can't
 231  * depend on nlm_granted messages containing the same cookie we sent
 232  * with the original lock request.
 233  *
 234  * We search from the head of the list assuming that the message we're
 235  * looking for is for an older request (because we have an answer to it).
 236  * This assumes that lock request will be answered primarily in FIFO order.
 237  * However, this may not be the case if there are blocked requests.  We may
 238  * want to move blocked requests to a separate queue (but that'll complicate
 239  * duplicate xid checking).
 240  */
 241 static inline LOCKD_MSG_REQUEST *
 242 nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
 243 {
 244         LOCKD_MSG_REQUEST *mr;
 245
 246         if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
 247                 return NULL;
 248         TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
 249                 if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
 250                         break;
 251         }
 252         return mr;
 253 }
 254
 255 /*
 256  * return the next unique lock request transaction ID
 257  */
 258 static inline uint64_t
 259 nfs_lockxid_get(void)
 260 {
 261         LOCKD_MSG_REQUEST *mr;
 262
 263         /* derive initial lock xid from system time */
 264         if (!nfs_lockxid) {
 265                 /*
 266                  * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
 267                  * due to a broken clock) because we immediately increment it
 268                  * and we guarantee to never use xid 0.  So, nfs_lockxid should only
 269                  * ever be 0 the first time this function is called.
 270                  */
 271                 struct timeval tv;
 272                 microtime(&tv);
 273                 nfs_lockxid = (uint64_t)tv.tv_sec << 12;
 274         }
 275
 276         /* make sure we get a unique xid */
 277         do {
 278                 /* Skip zero xid if it should ever happen.  */
 279                 if (++nfs_lockxid == 0)
 280                         nfs_lockxid++;
 281                 if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
 282                      (mr->lmr_msg.lm_xid < nfs_lockxid)) {
 283                         /* fast path: empty queue or new largest xid */
 284                         break;
 285                 }
 286                 /* check if xid is already in use */
 287         } while (nfs_lockdmsg_find_by_xid(nfs_lockxid));
 288
 289         return nfs_lockxid;
 290 }
 291
 292
 293 /*
 294  * Check the nfs_lock_pid hash table for an entry and, if requested,
 295  * add the entry if it is not found.
 296  *
 297  * (Also, if adding, try to clean up some stale entries.)
 298  */
 299 static int
 300 nfs_lock_pid_check(proc_t p, int addflag, vnode_t vp)
 301 {
 302         struct nfs_lock_pid *lp, *lplru, *lplru_next;
 303         proc_t plru;
 304         int error = 0;
 305         struct timeval now;
 306
 307         /* lock hash */
 308 loop:
 309         if (nfs_lock_pid_lock) {
 310                 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 311                 while (nfs_lock_pid_lock) {
 312                         nfs_lock_pid_lock = -1;
 313                         tsleep(&nfs_lock_pid_lock, PCATCH, "nfslockpid", 0);
 314                         if ((error = nfs_sigintr(nmp, NULL, p)))
 315                                 return (error);
 316                 }
 317                 goto loop;
 318         }
 319         nfs_lock_pid_lock = 1;
 320
 321         /* Search hash chain */
 322         error = ENOENT;
 323         lp = NFS_LOCK_PID_HASH(proc_pid(p))->lh_first;
 324         for (; lp != NULL; lp = lp->lp_hash.le_next)
 325                 if (lp->lp_pid == proc_pid(p)) {
 326                         /* found pid... */
 327                         if (timevalcmp(&lp->lp_pid_start, &p->p_stats->p_start, ==)) {
 328                                 /* ...and it's valid */
 329                                 /* move to tail of LRU */
 330                                 TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
 331                                 microuptime(&now);
 332                                 lp->lp_time = now.tv_sec;
 333                                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
 334                                 error = 0;
 335                                 break;
 336                         }
 337                         /* ...but it's no longer valid */
 338                         /* remove from hash, invalidate, and move to lru head */
 339                         LIST_REMOVE(lp, lp_hash);
 340                         lp->lp_valid = 0;
 341                         TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
 342                         TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
 343                         lp = NULL;
 344                         break;
 345                 }
 346
 347         /* if we didn't find it (valid) and we've been asked to add it */
 348         if ((error == ENOENT) && addflag) {
 349                 /* scan lru list for invalid, stale entries to reuse/free */
 350                 int lrucnt = 0;
 351                 microuptime(&now);
 352                 for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
 353                         lplru_next = TAILQ_NEXT(lplru, lp_lru);
 354                         if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
 355                                 /*
 356                                  * If the oldest LRU entry is relatively new, then don't
 357                                  * bother scanning any further.
 358                                  */
 359                                 break;
 360                         }
 361                         /* remove entry from LRU, and check if it's still in use */
 362                         TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
 363                         if (!lplru->lp_valid || !(plru = pfind(lplru->lp_pid)) ||
 364                             timevalcmp(&lplru->lp_pid_start, &plru->p_stats->p_start, !=)) {
 365                                 /* no longer in use */
 366                                 LIST_REMOVE(lplru, lp_hash);
 367                                 if (!lp) {
 368                                         /* we'll reuse this one */
 369                                         lp = lplru;
 370                                 } else {
 371                                         /* we can free this one */
 372                                         FREE(lplru, M_TEMP);
 373                                 }
 374                         } else {
 375                                 /* still in use */
 376                                 lplru->lp_time = now.tv_sec;
 377                                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
 378                         }
 379                         /* don't check too many entries at once */
 380                         if (++lrucnt > 8)
 381                                 break;
 382                 }
 383                 if (!lp) {
 384                         /* we need to allocate a new one */
 385                         MALLOC(lp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
 386                                 M_TEMP, M_WAITOK | M_ZERO);
 387                 }
 388                 if (!lp) {
 389                         error = ENOMEM;
 390                 } else {
 391                         /* (re)initialize nfs_lock_pid info */
 392                         lp->lp_pid = proc_pid(p);
 393                         lp->lp_pid_start = p->p_stats->p_start;
 394                         /* insert pid in hash */
 395                         LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
 396                         lp->lp_valid = 1;
 397                         lp->lp_time = now.tv_sec;
 398                         TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
 399                         error = 0;
 400                 }
 401         }
 402
 403         /* unlock hash */
 404         if (nfs_lock_pid_lock < 0) {
 405                 nfs_lock_pid_lock = 0;
 406                 wakeup(&nfs_lock_pid_lock);
 407         } else
 408                 nfs_lock_pid_lock = 0;
 409
 410         return (error);
 411 }
 412
 413
  /*
   * nfs_dolock --
   *      NFS advisory byte-level locks.
   */
 418 int
 419 nfs_dolock(struct vnop_advlock_args *ap)
 420 /* struct vnop_advlock_args {
 421         struct vnodeop_desc *a_desc;
 422         vnode_t a_vp;
 423         caddr_t a_id;
 424         int a_op;
 425         struct flock *a_fl;
 426         int a_flags;
 427         vfs_context_t a_context;
 428 }; */
 429 {
 430         LOCKD_MSG_REQUEST msgreq;
 431         LOCKD_MSG *msg;
 432         vnode_t vp, wvp;
 433         struct nfsnode *np;
 434         int error, error1;
 435         struct flock *fl;
 436         int fmode, ioflg;
 437         struct nfsmount *nmp;
 438         struct nfs_vattr nvattr;
 439         off_t start, end;
 440         struct timeval now;
 441         int timeo, endtime, lastmsg, wentdown = 0;
 442         int lockpidcheck;
 443         kauth_cred_t cred;
 444         proc_t p;
 445         struct sockaddr *saddr;
 446
 447         p = vfs_context_proc(ap->a_context);
 448         cred = vfs_context_ucred(ap->a_context);
 449
 450         vp = ap->a_vp;
 451         fl = ap->a_fl;
 452         np = VTONFS(vp);
 453
 454         nmp = VFSTONFS(vnode_mount(vp));
 455         if (!nmp)
 456                 return (ENXIO);
 457         if (nmp->nm_flag & NFSMNT_NOLOCKS)
 458                 return (ENOTSUP);
 459
 460         /*
 461          * The NLM protocol doesn't allow the server to return an error
 462          * on ranges, so we do it.  Pre LFS (Large File Summit)
 463          * standards required EINVAL for the range errors.  More recent
 464          * standards use EOVERFLOW, but their EINVAL wording still
 465          * encompasses these errors.
 466          * Any code sensitive to this is either:
 467          *  1) written pre-LFS and so can handle only EINVAL, or
 468          *  2) written post-LFS and thus ought to be tolerant of pre-LFS
 469          *     implementations.
 470          * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
 471          */
 472         if (fl->l_whence != SEEK_END) {
 473                 if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
 474                     fl->l_start < 0 ||
 475                     (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
 476                     (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
 477                         return (EINVAL);
 478         }
 479         /*
 480          * If daemon is running take a ref on its fifo vnode
 481          */
 482         if (!(wvp = nfslockdvnode)) {
 483                 if (!nfslockdwaiting && !nfslockdstarttimeout)
 484                         return (ENOTSUP);
 485                 /*
 486                  * Don't wake lock daemon if it hasn't been started yet and
 487                  * this is an unlock request (since we couldn't possibly
 488                  * actually have a lock on the file).  This could be an
 489                  * uninformed unlock request due to closef()'s behavior of doing
 490                  * unlocks on all files if a process has had a lock on ANY file.
 491                  */
 492                 if (!nfslockdvnode && (fl->l_type == F_UNLCK))
 493                         return (EINVAL);
 494                 microuptime(&now);
 495                 if (nfslockdwaiting) {
 496                         /* wake up lock daemon */
 497                         nfslockdstarttimeout = now.tv_sec + 60;
 498                         (void)wakeup((void *)&nfslockdwaiting);
 499                 }
 500                 /* wait on nfslockdvnode for a while to allow daemon to start */
 501                 while (!nfslockdvnode && (now.tv_sec < nfslockdstarttimeout)) {
 502                         error = tsleep((void *)&nfslockdvnode, PCATCH | PUSER, "lockdstart", 2*hz);
 503                         if (error && (error != EWOULDBLOCK))
 504                                 return (error);
 505                         /* check that we still have our mount... */
 506                         /* ...and that we still support locks */
 507                         nmp = VFSTONFS(vnode_mount(vp));
 508                         if (!nmp)
 509                                 return (ENXIO);
 510                         if (nmp->nm_flag & NFSMNT_NOLOCKS)
 511                                 return (ENOTSUP);
 512                         if (!error)
 513                                 break;
 514                         microuptime(&now);
 515                 }
 516                 /*
 517                  * check for nfslockdvnode
 518                  * If it hasn't started by now, there's a problem.
 519                  */
 520                 if (!(wvp = nfslockdvnode))
 521                         return (ENOTSUP);
 522         }
 523         error = vnode_getwithref(wvp);
 524         if (error)
 525                 return (ENOTSUP);
 526         error = vnode_ref(wvp);
 527         if (error) {
 528                 vnode_put(wvp);
 529                 return (ENOTSUP);
 530         }
 531
 532         /*
 533          * Need to check if this process has successfully acquired an NFS lock before.
 534          * If not, and this is an unlock request we can simply return success here.
 535          */
 536         lockpidcheck = nfs_lock_pid_check(p, 0, vp);
 537         if (lockpidcheck) {
 538                 if (lockpidcheck != ENOENT) {
 539                         vnode_rele(wvp);
 540                         vnode_put(wvp);
 541                         return (lockpidcheck);
 542                 }
 543                 if (ap->a_op == F_UNLCK) {
 544                         vnode_rele(wvp);
 545                         vnode_put(wvp);
 546                         return (0);
 547                 }
 548         }
 549
 550         /*
 551          * The NFS Lock Manager protocol doesn't directly handle
 552          * negative lengths or SEEK_END, so we need to normalize
 553          * things here where we have all the info.
 554          * (Note: SEEK_CUR is already adjusted for at this point)
 555          */
 556         /* Convert the flock structure into a start and end. */
 557         switch (fl->l_whence) {
 558         case SEEK_SET:
 559         case SEEK_CUR:
 560                 /*
 561                  * Caller is responsible for adding any necessary offset
 562                  * to fl->l_start when SEEK_CUR is used.
 563                  */
 564                 start = fl->l_start;
 565                 break;
 566         case SEEK_END:
 567                 /* need to flush, and refetch attributes to make */
 568                 /* sure we have the correct end of file offset   */
 569                 if (np->n_flag & NMODIFIED) {
 570                         NATTRINVALIDATE(np);
 571                         error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 572                         if (error) {
 573                                 vnode_rele(wvp);
 574                                 vnode_put(wvp);
 575                                 return (error);
 576                         }
 577                 }
 578                 NATTRINVALIDATE(np);
 579
 580                 error = nfs_getattr(vp, &nvattr, cred, p);
 581                 if (error) {
 582                         vnode_rele(wvp);
 583                         vnode_put(wvp);
 584                         return (error);
 585                 }
 586                 start = np->n_size + fl->l_start;
 587                 break;
 588         default:
 589                 vnode_rele(wvp);
 590                 vnode_put(wvp);
 591                 return (EINVAL);
 592         }
 593         if (fl->l_len == 0)
 594                 end = -1;
 595         else if (fl->l_len > 0)
 596                 end = start + fl->l_len - 1;
 597         else { /* l_len is negative */
 598                 end = start - 1;
 599                 start += fl->l_len;
 600         }
 601         if (start < 0) {
 602                 vnode_rele(wvp);
 603                 vnode_put(wvp);
 604                 return (EINVAL);
 605         }
 606         if (!NFS_ISV3(vp) &&
 607             ((start >= 0x80000000) || (end >= 0x80000000))) {
 608                 vnode_rele(wvp);
 609                 vnode_put(wvp);
 610                 return (EINVAL);
 611         }
 612
 613         /*
 614          * Fill in the information structure.
 615          */
 616         msgreq.lmr_answered = 0;
 617         msgreq.lmr_errno = 0;
 618         msgreq.lmr_saved_errno = 0;
 619         msg = &msgreq.lmr_msg;
 620         msg->lm_version = LOCKD_MSG_VERSION;
 621         msg->lm_flags = 0;
 622
 623         msg->lm_fl = *fl;
 624         msg->lm_fl.l_start = start;
 625         if (end != -1)
 626                 msg->lm_fl.l_len = end - start + 1;
 627         msg->lm_fl.l_pid = proc_pid(p);
 628
 629         if (ap->a_flags & F_WAIT)
 630                 msg->lm_flags |= LOCKD_MSG_BLOCK;
 631         if (ap->a_op == F_GETLK)
 632                 msg->lm_flags |= LOCKD_MSG_TEST;
 633
 634         nmp = VFSTONFS(vnode_mount(vp));
 635         if (!nmp) {
 636                 vnode_rele(wvp);
 637                 vnode_put(wvp);
 638                 return (ENXIO);
 639         }
 640
 641         saddr = mbuf_data(nmp->nm_nam);
 642         bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len));
 643         msg->lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
 644         bcopy(VTONFS(vp)->n_fhp, msg->lm_fh, msg->lm_fh_len);
 645         if (NFS_ISV3(vp))
 646                 msg->lm_flags |= LOCKD_MSG_NFSV3;
 647         cru2x(cred, &msg->lm_cred);
 648
 649         microuptime(&now);
 650         lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
 651
 652         fmode = FFLAGS(O_WRONLY);
 653         if ((error = VNOP_OPEN(wvp, fmode, ap->a_context))) {
 654                 vnode_rele(wvp);
 655                 vnode_put(wvp);
 656                 return (error);
 657         }
 658         vnode_lock(wvp);
 659         ++wvp->v_writecount;
 660         vnode_unlock(wvp);
 661
 662         /* allocate unique xid */
 663         msg->lm_xid = nfs_lockxid_get();
 664         nfs_lockdmsg_enqueue(&msgreq);
 665
 666         timeo = 2*hz;
 667 #define IO_NOMACCHECK 0;
 668         ioflg = IO_UNIT | IO_NOMACCHECK;
 669         for (;;) {
 670                 error = 0;
 671                 while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
 672                         nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
 673                         error = tsleep((void *)&nfslockdfifolock,
 674                                         PCATCH | PUSER, "lockdfifo", 20*hz);
 675                         if (error)
 676                                 break;
 677                 }
 678                 if (error)
 679                         break;
 680                 nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;
 681
 682                 error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)msg, sizeof(*msg), 0,
 683                     UIO_SYSSPACE32, ioflg, proc_ucred(kernproc), NULL, p);
 684
 685                 nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
 686                 if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
 687                         nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
 688                         wakeup((void *)&nfslockdfifolock);
 689                 }
 690
 691                 if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
 692                         break;
 693                 }
 694
 695                 /*
 696                  * Always wait for an answer.  Not waiting for unlocks could
 697                  * cause a lock to be left if the unlock request gets dropped.
 698                  */
 699
 700                 /*
 701                  * Retry if it takes too long to get a response.
 702                  *
 703                  * The timeout numbers were picked out of thin air... they start
 704                  * at 2 and double each timeout with a max of 60 seconds.
 705                  *
 706                  * In order to maintain responsiveness, we pass a small timeout
 707                  * to tsleep and calculate the timeouts ourselves.  This allows
 708                  * us to pick up on mount changes quicker.
 709                  */
 710 wait_for_granted:
 711                 error = EWOULDBLOCK;
 712                 microuptime(&now);
 713                 if ((timeo/hz) > 0)
 714                         endtime = now.tv_sec + timeo/hz;
 715                 else
 716                         endtime = now.tv_sec + 1;
 717                 while (now.tv_sec < endtime) {
 718                         error = tsleep((void *)&msgreq, PCATCH | PUSER, "lockd", 2*hz);
 719                         if (msgreq.lmr_answered) {
 720                                 /*
 721                                  * Note: it's possible to have a lock granted at
 722                                  * essentially the same time that we get interrupted.
 723                                  * Since the lock may be granted, we can't return an
 724                                  * error from this request or we might not unlock the
 725                                  * lock that's been granted.
 726                                  */
 727                                 error = 0;
 728                                 break;
 729                         }
 730                         if (error != EWOULDBLOCK)
 731                                 break;
 732                         /* check that we still have our mount... */
 733                         /* ...and that we still support locks */
 734                         nmp = VFSTONFS(vnode_mount(vp));
 735                         if (!nmp || (nmp->nm_flag & NFSMNT_NOLOCKS))
 736                                 break;
 737                         /*
 738                          * If the mount is hung and we've requested not to hang
 739                          * on remote filesystems, then bail now.
 740                          */
 741                         if ((p != NULL) && ((proc_noremotehang(p)) != 0) &&
 742                             ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
 743                                 if (fl->l_type == F_UNLCK)
 744                                         printf("nfs_dolock: aborting unlock request "
 745                                             "due to timeout (noremotehang)\n");
 746                                 error = EIO;
 747                                 break;
 748                         }
 749                         microuptime(&now);
 750                 }
 751                 if (error) {
 752                         /* check that we still have our mount... */
 753                         nmp = VFSTONFS(vnode_mount(vp));
 754                         if (!nmp) {
 755                                 if (error == EWOULDBLOCK)
 756                                         error = ENXIO;
 757                                 break;
 758                         }
 759                         /* ...and that we still support locks */
 760                         if (nmp->nm_flag & NFSMNT_NOLOCKS) {
 761                                 if (error == EWOULDBLOCK)
 762                                         error = ENOTSUP;
 763                                 break;
 764                         }
 765                         if ((error == ENOTSUP) &&
 766                             (nmp->nm_state & NFSSTA_LOCKSWORK)) {
 767                                 /*
 768                                  * We have evidence that locks work, yet lockd
 769                                  * returned ENOTSUP.  This is probably because
 770                                  * it was unable to contact the server's lockd to
 771                                  * send it the request.
 772                                  *
 773                                  * Because we know locks work, we'll consider
 774                                  * this failure to be a timeout.
 775                                  */
 776                                 error = EWOULDBLOCK;
 777                         }
 778                         if (error != EWOULDBLOCK) {
 779                                 /*
 780                                  * We're going to bail on this request.
 781                                  * If we were a blocked lock request, send a cancel.
 782                                  */
 783                                 if ((msgreq.lmr_errno == EINPROGRESS) &&
 784                                     !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
 785                                         /* set this request up as a cancel */
 786                                         msg->lm_flags |= LOCKD_MSG_CANCEL;
 787                                         nfs_lockdmsg_dequeue(&msgreq);
 788                                         msg->lm_xid = nfs_lockxid_get();
 789                                         nfs_lockdmsg_enqueue(&msgreq);
 790                                         msgreq.lmr_saved_errno = error;
 791                                         msgreq.lmr_errno = 0;
 792                                         msgreq.lmr_answered = 0;
 793                                         /* reset timeout */
 794                                         timeo = 2*hz;
 795                                         /* send cancel request */
 796                                         continue;
 797                                 }
 798                                 break;
 799                         }
 800
 801                         /*
 802                          * If the mount is hung and we've requested not to hang
 803                          * on remote filesystems, then bail now.
 804                          */
 805                         if ((p != NULL) && ((proc_noremotehang(p)) != 0) &&
 806                             ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
 807                                 if (fl->l_type == F_UNLCK)
 808                                         printf("nfs_dolock: aborting unlock request "
 809                                             "due to timeout (noremotehang)\n");
 810                                 error = EIO;
 811                                 break;
 812                         }
 813                         /* warn if we're not getting any response */
 814                         microuptime(&now);
 815                         if ((msgreq.lmr_errno != EINPROGRESS) &&
 816                             (nmp->nm_tprintf_initial_delay != 0) &&
 817                             ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
 818                                 lastmsg = now.tv_sec;
 819                                 nfs_down(nmp, p, 0, NFSSTA_LOCKTIMEO, "lockd not responding");
 820                                 wentdown = 1;
 821                         }
 822                         if (msgreq.lmr_errno == EINPROGRESS) {
 823                                 /*
 824                                  * We've got a blocked lock request that we are
 825                                  * going to retry.  First, we'll want to try to
 826                                  * send a cancel for the previous request.
 827                                  *
 828                                  * Clear errno so if we don't get a response
 829                                  * to the resend we'll call nfs_down().
 830                                  * Also reset timeout because we'll expect a
 831                                  * quick response to the cancel/resend (even if
 832                                  * it is NLM_BLOCKED).
 833                                  */
 834                                 msg->lm_flags |= LOCKD_MSG_CANCEL;
 835                                 nfs_lockdmsg_dequeue(&msgreq);
 836                                 msg->lm_xid = nfs_lockxid_get();
 837                                 nfs_lockdmsg_enqueue(&msgreq);
 838                                 msgreq.lmr_saved_errno = msgreq.lmr_errno;
 839                                 msgreq.lmr_errno = 0;
 840                                 msgreq.lmr_answered = 0;
 841                                 timeo = 2*hz;
 842                                 /* send cancel then resend request */
 843                                 continue;
 844                         }
 845                         /*
 846                          * We timed out, so we will rewrite the request
 847                          * to the fifo, but only if it isn't already full.
 848                          */
 849                         ioflg |= IO_NDELAY;
 850                         timeo *= 2;
 851                         if (timeo > 60*hz)
 852                                 timeo = 60*hz;
 853                         /* resend request */
 854                         continue;
 855                 }
 856
 857                 /* we got a response, so the server's lockd is OK */
 858                 nfs_up(VFSTONFS(vnode_mount(vp)), p, NFSSTA_LOCKTIMEO,
 859                         wentdown ? "lockd alive again" : NULL);
 860                 wentdown = 0;
 861
 862                 if (msgreq.lmr_errno == EINPROGRESS) {
 863                         /* got NLM_BLOCKED response */
 864                         /* need to wait for NLM_GRANTED */
 865                         timeo = 60*hz;
 866                         msgreq.lmr_answered = 0;
 867                         goto wait_for_granted;
 868                 }
 869
 870                 if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
 871                     (msgreq.lmr_saved_errno == EINPROGRESS)) {
 872                         /*
 873                          * We just got a successful reply to the
 874                          * cancel of the previous blocked lock request.
 875                          * Now, go ahead and resend the request.
 876                          */
 877                         msg->lm_flags &= ~LOCKD_MSG_CANCEL;
 878                         nfs_lockdmsg_dequeue(&msgreq);
 879                         msg->lm_xid = nfs_lockxid_get();
 880                         nfs_lockdmsg_enqueue(&msgreq);
 881                         msgreq.lmr_saved_errno = 0;
 882                         msgreq.lmr_errno = 0;
 883                         msgreq.lmr_answered = 0;
 884                         timeo = 2*hz;
 885                         /* resend request */
 886                         continue;
 887                 }
 888
 889                 if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
 890                         if (msg->lm_fl.l_type != F_UNLCK) {
 891                                 fl->l_type = msg->lm_fl.l_type;
 892                                 fl->l_pid = msg->lm_fl.l_pid;
 893                                 fl->l_start = msg->lm_fl.l_start;
 894                                 fl->l_len = msg->lm_fl.l_len;
 895                                 fl->l_whence = SEEK_SET;
 896                         } else {
 897                                 fl->l_type = F_UNLCK;
 898                         }
 899                 }
 900
 901                 /*
 902                  * If the blocked lock request was cancelled.
 903                  * Restore the error condition from when we
 904                  * originally bailed on the request.
 905                  */
 906                 if (msg->lm_flags & LOCKD_MSG_CANCEL) {
 907                         msg->lm_flags &= ~LOCKD_MSG_CANCEL;
 908                         error = msgreq.lmr_saved_errno;
 909                 } else
 910                         error = msgreq.lmr_errno;
 911
 912                 if (!error) {
 913                         /* record that NFS file locking has worked on this mount */
 914                         nmp = VFSTONFS(vnode_mount(vp));
 915                         if (nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK))
 916                                 nmp->nm_state |= NFSSTA_LOCKSWORK;
 917                         /*
 918                          * If we successfully acquired a lock, make sure this pid
 919                          * is in the nfs_lock_pid hash table so we know we can't
 920                          * short-circuit unlock requests.
 921                          */
 922                         if ((lockpidcheck == ENOENT) &&
 923                             ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW)))
 924                                 nfs_lock_pid_check(p, 1, vp);
 925
 926                 }
 927                 break;
 928         }
 929
 930         nfs_lockdmsg_dequeue(&msgreq);
 931
 932         error1 = VNOP_CLOSE(wvp, FWRITE, ap->a_context);
 933         vnode_rele(wvp);
 934         vnode_put(wvp);
 935         /* prefer any previous 'error' to our vn_close 'error1'. */
 936         return (error != 0 ? error : error1);
 937 }
 938
 939 /*
 940  * nfslockdans --
 941  *      NFS advisory byte-level locks answer from the lock daemon.
 942  */
 943 int
 944 nfslockdans(proc_t p, struct lockd_ans *ansp)
 945 {
 946         LOCKD_MSG_REQUEST *msgreq;
 947         int error;
 948
 949         /* Let root make this call. */
 950         error = proc_suser(p);
 951         if (error)
 952                 return (error);
 953
 954         /* the version should match, or we're out of sync */
 955         if (ansp->la_version != LOCKD_ANS_VERSION)
 956                 return (EINVAL);
 957
 958         /* try to find the lockd message by transaction id (cookie) */
 959         msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
 960         if (ansp->la_flags & LOCKD_ANS_GRANTED) {
 961                 /*
 962                  * We can't depend on the granted message having our cookie,
 963                  * so we check the answer against the lockd message found.
 964                  * If no message was found or it doesn't match the answer,
 965                  * we look for the lockd message by the answer's lock info.
 966                  */
 967                 if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
 968                         msgreq = nfs_lockdmsg_find_by_answer(ansp);
 969                 /*
 970                  * We need to make sure this request isn't being cancelled
 971                  * If it is, we don't want to accept the granted message.
 972                  */
 973                 if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
 974                         msgreq = NULL;
 975         }
 976         if (!msgreq)
 977                 return (EPIPE);
 978
 979         msgreq->lmr_errno = ansp->la_errno;
 980         if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
 981                 if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
 982                         if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
 983                                 msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
 984                         else
 985                                 msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
 986                         msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
 987                         msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
 988                         msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
 989                 } else {
 990                         msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
 991                 }
 992         }
 993
 994         msgreq->lmr_answered = 1;
 995         (void)wakeup((void *)msgreq);
 996
 997         return (0);
 998 }
 999
1000 /*
1001  * nfslockdfd --
1002  *      NFS advisory byte-level locks: fifo file# from the lock daemon.
1003  */
1004 int
1005 nfslockdfd(proc_t p, int fd)
1006 {
1007         int error;
1008         vnode_t vp, oldvp;
1009
1010         error = proc_suser(p);
1011         if (error)
1012                 return (error);
1013         if (fd < 0) {
1014                 vp = NULL;
1015         } else {
1016                 error = file_vnode(fd, &vp);
1017                 if (error)
1018                         return (error);
1019                 error = vnode_getwithref(vp);
1020                 if (error)
1021                         return (error);
1022                 error = vnode_ref(vp);
1023                 if (error) {
1024                         vnode_put(vp);
1025                         return (error);
1026                 }
1027         }
1028         oldvp = nfslockdvnode;
1029         nfslockdvnode = vp;
1030         if (oldvp) {
1031                 vnode_rele(oldvp);
1032         }
1033         (void)wakeup((void *)&nfslockdvnode);
1034         if (vp) {
1035                 vnode_put(vp);
1036         }
1037         return (0);
1038 }
1039
1040 /*
1041  * nfslockdwait --
1042  *      lock daemon waiting for lock request
1043  */
1044 int
1045 nfslockdwait(proc_t p)
1046 {
1047         int error;
1048
1049         error = proc_suser(p);
1050         if (error)
1051                 return (error);
1052         if (nfslockdwaiting || nfslockdvnode)
1053                 return (EBUSY);
1054
1055         nfslockdstarttimeout = 0;
1056         nfslockdwaiting = 1;
1057         tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
1058         nfslockdwaiting = 0;
1059
1060         return (0);
1061 }