/* bsd/nfs/nfs_lock.c — from Apple xnu-517.7.7 */
/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*-
 * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Berkeley Software Design Inc's name may not be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>		/* for hz */
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/lockf.h>		/* Must come after sys/malloc.h */
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>

#include <kern/thread_act.h>

#include <machine/limits.h>

#include <net/if.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_lock.h>

85 #define OFF_MAX QUAD_MAX
86
87 uint64_t nfsadvlocks = 0;
88 struct timeval nfsadvlock_longest = {0, 0};
89 struct timeval nfsadvlocks_time = {0, 0};
90
91 /*
92 * globals for managing the lockd fifo
93 */
94 pid_t nfslockdpid = 0;
95 struct file *nfslockdfp = 0;
96 int nfslockdwaiting = 0;
97 int nfslockdfifowritten = 0;
98 int nfslockdfifolock = 0;
99 #define NFSLOCKDFIFOLOCK_LOCKED 1
100 #define NFSLOCKDFIFOLOCK_WANT 2
101
102 /*
103 * pending lock request messages are kept in this queue which is
104 * kept sorted by transaction ID (xid).
105 */
106 uint64_t nfs_lockxid = 0;
107 LOCKD_MSG_QUEUE nfs_pendlockq;
108
109 /*
110 * This structure is used to identify processes which have acquired NFS locks.
111 * Knowing which processes have ever acquired locks allows us to short-circuit
112 * unlock requests for processes that have never had an NFS file lock. Thus
113 * avoiding a costly and unnecessary lockd request.
114 */
115 struct nfs_lock_pid {
116 TAILQ_ENTRY(nfs_lock_pid) lp_lru; /* LRU list */
117 LIST_ENTRY(nfs_lock_pid) lp_hash; /* hash chain */
118 int lp_valid; /* valid entry? */
119 int lp_time; /* last time seen valid */
120 pid_t lp_pid; /* The process ID. */
121 struct timeval lp_pid_start; /* Start time of process id */
122 };
123
124 #define NFS_LOCK_PID_HASH_SIZE 64 // XXX tune me
125 #define NFS_LOCK_PID_HASH(pid) \
126 (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
127 LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
128 TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
129 u_long nfs_lock_pid_hash;
130 int nfs_lock_pid_lock;
131
132
133 /*
134 * initialize global nfs lock state
135 */
136 void
137 nfs_lockinit(void)
138 {
139 TAILQ_INIT(&nfs_pendlockq);
140 nfs_lock_pid_lock = 0;
141 nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
142 M_TEMP, &nfs_lock_pid_hash);
143 TAILQ_INIT(&nfs_lock_pid_lru);
144 }
145
146 /*
147 * insert a lock request message into the pending queue
148 */
149 static inline void
150 nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
151 {
152 LOCKD_MSG_REQUEST *mr;
153
154 mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
155 if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
156 /* fast path: empty queue or new largest xid */
157 TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
158 return;
159 }
160 /* slow path: need to walk list to find insertion point */
161 while (mr && (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
162 mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
163 }
164 if (mr) {
165 TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
166 } else {
167 TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
168 }
169 }
170
171 /*
172 * remove a lock request message from the pending queue
173 */
174 static inline void
175 nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
176 {
177 TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
178 }
179
180 /*
181 * find a pending lock request message by xid
182 *
183 * We search from the head of the list assuming that the message we're
184 * looking for is for an older request (because we have an answer to it).
185 * This assumes that lock request will be answered primarily in FIFO order.
186 * However, this may not be the case if there are blocked requests. We may
187 * want to move blocked requests to a separate queue (but that'll complicate
188 * duplicate xid checking).
189 */
190 static inline LOCKD_MSG_REQUEST *
191 nfs_lockdmsg_find_by_xid(uint64_t lockxid)
192 {
193 LOCKD_MSG_REQUEST *mr;
194
195 TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
196 if (mr->lmr_msg.lm_xid == lockxid)
197 return mr;
198 if (mr->lmr_msg.lm_xid > lockxid)
199 return NULL;
200 }
201 return mr;
202 }
203
204 /*
205 * Because we can't depend on nlm_granted messages containing the same
206 * cookie we sent with the original lock request, we need code test if
207 * an nlm_granted answer matches the lock request. We also need code
208 * that can find a lockd message based solely on the nlm_granted answer.
209 */
210
211 /*
212 * compare lockd message to answer
213 *
214 * returns 0 on equality and 1 if different
215 */
216 static inline int
217 nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
218 {
219 if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
220 return 1;
221 if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
222 return 1;
223 if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
224 return 1;
225 if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
226 return 1;
227 if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
228 return 1;
229 if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
230 return 1;
231 return 0;
232 }
233
234 /*
235 * find a pending lock request message based on the lock info provided
236 * in the lockd_ans/nlm_granted data. We need this because we can't
237 * depend on nlm_granted messages containing the same cookie we sent
238 * with the original lock request.
239 *
240 * We search from the head of the list assuming that the message we're
241 * looking for is for an older request (because we have an answer to it).
242 * This assumes that lock request will be answered primarily in FIFO order.
243 * However, this may not be the case if there are blocked requests. We may
244 * want to move blocked requests to a separate queue (but that'll complicate
245 * duplicate xid checking).
246 */
247 static inline LOCKD_MSG_REQUEST *
248 nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
249 {
250 LOCKD_MSG_REQUEST *mr;
251
252 if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
253 return NULL;
254 TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
255 if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
256 break;
257 }
258 return mr;
259 }
260
261 /*
262 * return the next unique lock request transaction ID
263 */
264 static inline uint64_t
265 nfs_lockxid_get(void)
266 {
267 LOCKD_MSG_REQUEST *mr;
268
269 /* derive initial lock xid from system time */
270 if (!nfs_lockxid) {
271 /*
272 * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
273 * due to a broken clock) because we immediately increment it
274 * and we guarantee to never use xid 0. So, nfs_lockxid should only
275 * ever be 0 the first time this function is called.
276 */
277 struct timeval tv;
278 microtime(&tv);
279 nfs_lockxid = (uint64_t)tv.tv_sec << 12;
280 }
281
282 /* make sure we get a unique xid */
283 do {
284 /* Skip zero xid if it should ever happen. */
285 if (++nfs_lockxid == 0)
286 nfs_lockxid++;
287 if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
288 (mr->lmr_msg.lm_xid < nfs_lockxid)) {
289 /* fast path: empty queue or new largest xid */
290 break;
291 }
292 /* check if xid is already in use */
293 } while (nfs_lockdmsg_find_by_xid(nfs_lockxid));
294
295 return nfs_lockxid;
296 }
297
298
299 /*
300 * Check the nfs_lock_pid hash table for an entry and, if requested,
301 * add the entry if it is not found.
302 *
303 * (Also, if adding, try to clean up some stale entries.)
304 */
305 static int
306 nfs_lock_pid_check(struct proc *p, int addflag, struct vnode *vp)
307 {
308 struct nfs_lock_pid *lp, *lplru, *lplru_next;
309 struct proc *plru;
310 int error = 0;
311 struct timeval now;
312
313 /* lock hash */
314 loop:
315 if (nfs_lock_pid_lock) {
316 while (nfs_lock_pid_lock) {
317 nfs_lock_pid_lock = -1;
318 tsleep(&nfs_lock_pid_lock, PCATCH, "nfslockpid", 0);
319 if ((error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)))
320 return (error);
321 }
322 goto loop;
323 }
324 nfs_lock_pid_lock = 1;
325
326 /* Search hash chain */
327 error = ENOENT;
328 lp = NFS_LOCK_PID_HASH(p->p_pid)->lh_first;
329 for (; lp != NULL; lp = lp->lp_hash.le_next)
330 if (lp->lp_pid == p->p_pid) {
331 /* found pid... */
332 if (timevalcmp(&lp->lp_pid_start, &p->p_stats->p_start, ==)) {
333 /* ...and it's valid */
334 /* move to tail of LRU */
335 TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
336 microuptime(&now);
337 lp->lp_time = now.tv_sec;
338 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
339 error = 0;
340 break;
341 }
342 /* ...but it's no longer valid */
343 /* remove from hash, invalidate, and move to lru head */
344 LIST_REMOVE(lp, lp_hash);
345 lp->lp_valid = 0;
346 TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
347 TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
348 lp = NULL;
349 break;
350 }
351
352 /* if we didn't find it (valid) and we've been asked to add it */
353 if ((error == ENOENT) && addflag) {
354 /* scan lru list for invalid, stale entries to reuse/free */
355 int lrucnt = 0;
356 microuptime(&now);
357 for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
358 lplru_next = TAILQ_NEXT(lplru, lp_lru);
359 if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
360 /*
361 * If the oldest LRU entry is relatively new, then don't
362 * bother scanning any further.
363 */
364 break;
365 }
366 /* remove entry from LRU, and check if it's still in use */
367 TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
368 if (!lplru->lp_valid || !(plru = pfind(lplru->lp_pid)) ||
369 timevalcmp(&lplru->lp_pid_start, &plru->p_stats->p_start, !=)) {
370 /* no longer in use */
371 LIST_REMOVE(lplru, lp_hash);
372 if (!lp) {
373 /* we'll reuse this one */
374 lp = lplru;
375 } else {
376 /* we can free this one */
377 FREE(lplru, M_TEMP);
378 }
379 } else {
380 /* still in use */
381 lplru->lp_time = now.tv_sec;
382 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
383 }
384 /* don't check too many entries at once */
385 if (++lrucnt > 8)
386 break;
387 }
388 if (!lp) {
389 /* we need to allocate a new one */
390 MALLOC(lp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
391 M_TEMP, M_WAITOK | M_ZERO);
392 }
393 /* (re)initialize nfs_lock_pid info */
394 lp->lp_pid = p->p_pid;
395 lp->lp_pid_start = p->p_stats->p_start;
396 /* insert pid in hash */
397 LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
398 lp->lp_valid = 1;
399 lp->lp_time = now.tv_sec;
400 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
401 error = 0;
402 }
403
404 /* unlock hash */
405 if (nfs_lock_pid_lock < 0) {
406 nfs_lock_pid_lock = 0;
407 wakeup(&nfs_lock_pid_lock);
408 } else
409 nfs_lock_pid_lock = 0;
410
411 return (error);
412 }
413
414
415 /*
416 * nfs_advlock --
417 * NFS advisory byte-level locks.
418 */
419 int
420 nfs_dolock(struct vop_advlock_args *ap)
421 /* struct vop_advlock_args {
422 struct vnodeop_desc *a_desc;
423 struct vnode *a_vp;
424 caddr_t a_id;
425 int a_op;
426 struct flock *a_fl;
427 int a_flags;
428 }; */
429 {
430 LOCKD_MSG_REQUEST msgreq;
431 LOCKD_MSG *msg;
432 struct vnode *vp, *wvp;
433 struct nfsnode *np;
434 int error, error1;
435 struct flock *fl;
436 int fmode, ioflg;
437 struct proc *p;
438 struct nfsmount *nmp;
439 struct vattr vattr;
440 off_t start, end;
441 struct timeval now;
442 int timeo, endtime, lastmsg, wentdown = 0;
443 int lockpidcheck;
444
445 p = current_proc();
446
447 vp = ap->a_vp;
448 fl = ap->a_fl;
449 np = VTONFS(vp);
450
451 nmp = VFSTONFS(vp->v_mount);
452 if (!nmp)
453 return (ENXIO);
454 if (nmp->nm_flag & NFSMNT_NOLOCKS)
455 return (EOPNOTSUPP);
456
457 /*
458 * The NLM protocol doesn't allow the server to return an error
459 * on ranges, so we do it. Pre LFS (Large File Summit)
460 * standards required EINVAL for the range errors. More recent
461 * standards use EOVERFLOW, but their EINVAL wording still
462 * encompasses these errors.
463 * Any code sensitive to this is either:
464 * 1) written pre-LFS and so can handle only EINVAL, or
465 * 2) written post-LFS and thus ought to be tolerant of pre-LFS
466 * implementations.
467 * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
468 */
469 if (fl->l_whence != SEEK_END) {
470 if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
471 fl->l_start < 0 ||
472 (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
473 (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
474 return (EINVAL);
475 }
476 /*
477 * If daemon is running take a ref on its fifo
478 */
479 if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) {
480 if (!nfslockdwaiting)
481 return (EOPNOTSUPP);
482 /*
483 * Don't wake lock daemon if it hasn't been started yet and
484 * this is an unlock request (since we couldn't possibly
485 * actually have a lock on the file). This could be an
486 * uninformed unlock request due to closef()'s behavior of doing
487 * unlocks on all files if a process has had a lock on ANY file.
488 */
489 if (!nfslockdfp && (fl->l_type == F_UNLCK))
490 return (EINVAL);
491 /* wake up lock daemon */
492 (void)wakeup((void *)&nfslockdwaiting);
493 /* wait on nfslockdfp for a while to allow daemon to start */
494 tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz);
495 /* check for nfslockdfp and f_data */
496 if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data))
497 return (EOPNOTSUPP);
498 }
499 VREF(wvp);
500
501 /*
502 * Need to check if this process has successfully acquired an NFS lock before.
503 * If not, and this is an unlock request we can simply return success here.
504 */
505 lockpidcheck = nfs_lock_pid_check(p, 0, vp);
506 if (lockpidcheck) {
507 if (lockpidcheck != ENOENT)
508 return (lockpidcheck);
509 if (ap->a_op == F_UNLCK) {
510 vrele(wvp);
511 return (0);
512 }
513 }
514
515 /*
516 * The NFS Lock Manager protocol doesn't directly handle
517 * negative lengths or SEEK_END, so we need to normalize
518 * things here where we have all the info.
519 * (Note: SEEK_CUR is already adjusted for at this point)
520 */
521 /* Convert the flock structure into a start and end. */
522 switch (fl->l_whence) {
523 case SEEK_SET:
524 case SEEK_CUR:
525 /*
526 * Caller is responsible for adding any necessary offset
527 * to fl->l_start when SEEK_CUR is used.
528 */
529 start = fl->l_start;
530 break;
531 case SEEK_END:
532 /* need to flush, and refetch attributes to make */
533 /* sure we have the correct end of file offset */
534 if (np->n_flag & NMODIFIED) {
535 np->n_xid = 0;
536 error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1);
537 if (error) {
538 vrele(wvp);
539 return (error);
540 }
541 }
542 np->n_xid = 0;
543 error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
544 if (error) {
545 vrele(wvp);
546 return (error);
547 }
548 start = np->n_size + fl->l_start;
549 break;
550 default:
551 vrele(wvp);
552 return (EINVAL);
553 }
554 if (fl->l_len == 0)
555 end = -1;
556 else if (fl->l_len > 0)
557 end = start + fl->l_len - 1;
558 else { /* l_len is negative */
559 end = start - 1;
560 start += fl->l_len;
561 }
562 if (start < 0) {
563 vrele(wvp);
564 return (EINVAL);
565 }
566 if (!NFS_ISV3(vp) &&
567 ((start >= 0x80000000) || (end >= 0x80000000))) {
568 vrele(wvp);
569 return (EINVAL);
570 }
571
572 /*
573 * Fill in the information structure.
574 */
575 msgreq.lmr_answered = 0;
576 msgreq.lmr_errno = 0;
577 msgreq.lmr_saved_errno = 0;
578 msg = &msgreq.lmr_msg;
579 msg->lm_version = LOCKD_MSG_VERSION;
580 msg->lm_flags = 0;
581
582 msg->lm_fl = *fl;
583 msg->lm_fl.l_start = start;
584 if (end != -1)
585 msg->lm_fl.l_len = end - start + 1;
586 msg->lm_fl.l_pid = p->p_pid;
587
588 if (ap->a_flags & F_WAIT)
589 msg->lm_flags |= LOCKD_MSG_BLOCK;
590 if (ap->a_op == F_GETLK)
591 msg->lm_flags |= LOCKD_MSG_TEST;
592
593 nmp = VFSTONFS(vp->v_mount);
594 if (!nmp) {
595 vrele(wvp);
596 return (ENXIO);
597 }
598
599 bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg->lm_addr,
600 min(sizeof msg->lm_addr,
601 mtod(nmp->nm_nam, struct sockaddr *)->sa_len));
602 msg->lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
603 bcopy(VTONFS(vp)->n_fhp, msg->lm_fh, msg->lm_fh_len);
604 if (NFS_ISV3(vp))
605 msg->lm_flags |= LOCKD_MSG_NFSV3;
606 cru2x(p->p_ucred, &msg->lm_cred);
607
608 microuptime(&now);
609 lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
610
611 fmode = FFLAGS(O_WRONLY);
612 if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) {
613 vrele(wvp);
614 return (error);
615 }
616 ++wvp->v_writecount;
617
618 /* allocate unique xid */
619 msg->lm_xid = nfs_lockxid_get();
620 nfs_lockdmsg_enqueue(&msgreq);
621
622 timeo = 2*hz;
623 #define IO_NOMACCHECK 0;
624 ioflg = IO_UNIT | IO_NOMACCHECK;
625 for (;;) {
626 VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE);
627
628 error = 0;
629 while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
630 nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
631 error = tsleep((void *)&nfslockdfifolock,
632 PCATCH | PUSER, "lockdfifo", 20*hz);
633 if (error)
634 break;
635 }
636 if (error)
637 break;
638 nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;
639
640 error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)msg, sizeof(*msg), 0,
641 UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p);
642
643 nfslockdfifowritten = 1;
644
645 nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
646 if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
647 nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
648 wakeup((void *)&nfslockdfifolock);
649 }
650 /* wake up lock daemon */
651 if (nfslockdwaiting)
652 (void)wakeup((void *)&nfslockdwaiting);
653
654 if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
655 break;
656 }
657
658 /*
659 * Always wait for an answer. Not waiting for unlocks could
660 * cause a lock to be left if the unlock request gets dropped.
661 */
662
663 /*
664 * Retry if it takes too long to get a response.
665 *
666 * The timeout numbers were picked out of thin air... they start
667 * at 2 and double each timeout with a max of 60 seconds.
668 *
669 * In order to maintain responsiveness, we pass a small timeout
670 * to tsleep and calculate the timeouts ourselves. This allows
671 * us to pick up on mount changes quicker.
672 */
673 wait_for_granted:
674 error = EWOULDBLOCK;
675 microuptime(&now);
676 if ((timeo/hz) > 0)
677 endtime = now.tv_sec + timeo/hz;
678 else
679 endtime = now.tv_sec + 1;
680 while (now.tv_sec < endtime) {
681 error = tsleep((void *)&msgreq, PCATCH | PUSER, "lockd", 2*hz);
682 if (msgreq.lmr_answered) {
683 /*
684 * Note: it's possible to have a lock granted at
685 * essentially the same time that we get interrupted.
686 * Since the lock may be granted, we can't return an
687 * error from this request or we might not unlock the
688 * lock that's been granted.
689 */
690 error = 0;
691 break;
692 }
693 if (error != EWOULDBLOCK)
694 break;
695 /* check that we still have our mount... */
696 /* ...and that we still support locks */
697 nmp = VFSTONFS(vp->v_mount);
698 if (!nmp || (nmp->nm_flag & NFSMNT_NOLOCKS))
699 break;
700 /*
701 * If the mount is hung and we've requested not to hang
702 * on remote filesystems, then bail now.
703 */
704 if ((p != NULL) && ((p->p_flag & P_NOREMOTEHANG) != 0) &&
705 ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
706 if (fl->l_type == F_UNLCK)
707 printf("nfs_dolock: aborting unlock request "
708 "due to timeout (noremotehang)\n");
709 error = EIO;
710 break;
711 }
712 microuptime(&now);
713 }
714 if (error) {
715 /* check that we still have our mount... */
716 nmp = VFSTONFS(vp->v_mount);
717 if (!nmp) {
718 if (error == EWOULDBLOCK)
719 error = ENXIO;
720 break;
721 }
722 /* ...and that we still support locks */
723 if (nmp->nm_flag & NFSMNT_NOLOCKS) {
724 if (error == EWOULDBLOCK)
725 error = EOPNOTSUPP;
726 break;
727 }
728 if ((error == EOPNOTSUPP) &&
729 (nmp->nm_state & NFSSTA_LOCKSWORK)) {
730 /*
731 * We have evidence that locks work, yet lockd
732 * returned EOPNOTSUPP. This is probably because
733 * it was unable to contact the server's lockd to
734 * send it the request.
735 *
736 * Because we know locks work, we'll consider
737 * this failure to be a timeout.
738 */
739 error = EWOULDBLOCK;
740 }
741 if (error != EWOULDBLOCK) {
742 /*
743 * We're going to bail on this request.
744 * If we were a blocked lock request, send a cancel.
745 */
746 if ((msgreq.lmr_errno == EINPROGRESS) &&
747 !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
748 /* set this request up as a cancel */
749 msg->lm_flags |= LOCKD_MSG_CANCEL;
750 nfs_lockdmsg_dequeue(&msgreq);
751 msg->lm_xid = nfs_lockxid_get();
752 nfs_lockdmsg_enqueue(&msgreq);
753 msgreq.lmr_saved_errno = error;
754 msgreq.lmr_errno = 0;
755 msgreq.lmr_answered = 0;
756 /* reset timeout */
757 timeo = 2*hz;
758 /* send cancel request */
759 continue;
760 }
761 break;
762 }
763
764 /*
765 * If the mount is hung and we've requested not to hang
766 * on remote filesystems, then bail now.
767 */
768 if ((p != NULL) && ((p->p_flag & P_NOREMOTEHANG) != 0) &&
769 ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
770 if (fl->l_type == F_UNLCK)
771 printf("nfs_dolock: aborting unlock request "
772 "due to timeout (noremotehang)\n");
773 error = EIO;
774 break;
775 }
776 /* warn if we're not getting any response */
777 microuptime(&now);
778 if ((msgreq.lmr_errno != EINPROGRESS) &&
779 (nmp->nm_tprintf_initial_delay != 0) &&
780 ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
781 lastmsg = now.tv_sec;
782 nfs_down(NULL, nmp, p, "lockd not responding",
783 0, NFSSTA_LOCKTIMEO);
784 wentdown = 1;
785 }
786 if (msgreq.lmr_errno == EINPROGRESS) {
787 /*
788 * We've got a blocked lock request that we are
789 * going to retry. First, we'll want to try to
790 * send a cancel for the previous request.
791 *
792 * Clear errno so if we don't get a response
793 * to the resend we'll call nfs_down().
794 * Also reset timeout because we'll expect a
795 * quick response to the cancel/resend (even if
796 * it is NLM_BLOCKED).
797 */
798 msg->lm_flags |= LOCKD_MSG_CANCEL;
799 nfs_lockdmsg_dequeue(&msgreq);
800 msg->lm_xid = nfs_lockxid_get();
801 nfs_lockdmsg_enqueue(&msgreq);
802 msgreq.lmr_saved_errno = msgreq.lmr_errno;
803 msgreq.lmr_errno = 0;
804 msgreq.lmr_answered = 0;
805 timeo = 2*hz;
806 /* send cancel then resend request */
807 continue;
808 }
809 /*
810 * We timed out, so we will rewrite the request
811 * to the fifo, but only if it isn't already full.
812 */
813 ioflg |= IO_NDELAY;
814 timeo *= 2;
815 if (timeo > 60*hz)
816 timeo = 60*hz;
817 /* resend request */
818 continue;
819 }
820
821 if (wentdown) {
822 /* we got a reponse, so the server's lockd is OK */
823 nfs_up(NULL, VFSTONFS(vp->v_mount), p, "lockd alive again",
824 NFSSTA_LOCKTIMEO);
825 wentdown = 0;
826 }
827
828 if (msgreq.lmr_errno == EINPROGRESS) {
829 /* got NLM_BLOCKED response */
830 /* need to wait for NLM_GRANTED */
831 timeo = 60*hz;
832 msgreq.lmr_answered = 0;
833 goto wait_for_granted;
834 }
835
836 if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
837 (msgreq.lmr_saved_errno == EINPROGRESS)) {
838 /*
839 * We just got a successful reply to the
840 * cancel of the previous blocked lock request.
841 * Now, go ahead and resend the request.
842 */
843 msg->lm_flags &= ~LOCKD_MSG_CANCEL;
844 nfs_lockdmsg_dequeue(&msgreq);
845 msg->lm_xid = nfs_lockxid_get();
846 nfs_lockdmsg_enqueue(&msgreq);
847 msgreq.lmr_saved_errno = 0;
848 msgreq.lmr_errno = 0;
849 msgreq.lmr_answered = 0;
850 timeo = 2*hz;
851 /* resend request */
852 continue;
853 }
854
855 if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
856 if (msg->lm_fl.l_type != F_UNLCK) {
857 fl->l_type = msg->lm_fl.l_type;
858 fl->l_pid = msg->lm_fl.l_pid;
859 fl->l_start = msg->lm_fl.l_start;
860 fl->l_len = msg->lm_fl.l_len;
861 fl->l_whence = SEEK_SET;
862 } else {
863 fl->l_type = F_UNLCK;
864 }
865 }
866
867 /*
868 * If the blocked lock request was cancelled.
869 * Restore the error condition from when we
870 * originally bailed on the request.
871 */
872 if (msg->lm_flags & LOCKD_MSG_CANCEL) {
873 msg->lm_flags &= ~LOCKD_MSG_CANCEL;
874 error = msgreq.lmr_saved_errno;
875 } else
876 error = msgreq.lmr_errno;
877
878 if (!error) {
879 /* record that NFS file locking has worked on this mount */
880 nmp = VFSTONFS(vp->v_mount);
881 if (nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK))
882 nmp->nm_state |= NFSSTA_LOCKSWORK;
883 /*
884 * If we successfully acquired a lock, make sure this pid
885 * is in the nfs_lock_pid hash table so we know we can't
886 * short-circuit unlock requests.
887 */
888 if ((lockpidcheck == ENOENT) &&
889 ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW)))
890 nfs_lock_pid_check(p, 1, vp);
891
892 }
893 break;
894 }
895
896 nfs_lockdmsg_dequeue(&msgreq);
897
898 error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p);
899 /* prefer any previous 'error' to our vn_close 'error1'. */
900 return (error != 0 ? error : error1);
901 }
902
903 /*
904 * nfslockdans --
905 * NFS advisory byte-level locks answer from the lock daemon.
906 */
907 int
908 nfslockdans(struct proc *p, struct lockd_ans *ansp)
909 {
910 LOCKD_MSG_REQUEST *msgreq;
911 int error;
912
913 /*
914 * Let root, or someone who once was root (lockd generally
915 * switches to the daemon uid once it is done setting up) make
916 * this call.
917 *
918 * XXX This authorization check is probably not right.
919 */
920 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 &&
921 p->p_cred->p_svuid != 0)
922 return (error);
923
924 /* the version should match, or we're out of sync */
925 if (ansp->la_version != LOCKD_ANS_VERSION)
926 return (EINVAL);
927
928 /* try to find the lockd message by transaction id (cookie) */
929 msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
930 if (ansp->la_flags & LOCKD_ANS_GRANTED) {
931 /*
932 * We can't depend on the granted message having our cookie,
933 * so we check the answer against the lockd message found.
934 * If no message was found or it doesn't match the answer,
935 * we look for the lockd message by the answer's lock info.
936 */
937 if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
938 msgreq = nfs_lockdmsg_find_by_answer(ansp);
939 /*
940 * We need to make sure this request isn't being cancelled
941 * If it is, we don't want to accept the granted message.
942 */
943 if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
944 msgreq = NULL;
945 }
946 if (!msgreq)
947 return (EPIPE);
948
949 msgreq->lmr_errno = ansp->la_errno;
950 if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
951 if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
952 if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
953 msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
954 else
955 msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
956 msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
957 msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
958 msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
959 } else {
960 msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
961 }
962 }
963
964 msgreq->lmr_answered = 1;
965 (void)wakeup((void *)msgreq);
966
967 return (0);
968 }
969
970 /*
971 * nfslockdfd --
972 * NFS advisory byte-level locks: fifo file# from the lock daemon.
973 */
974 int
975 nfslockdfd(struct proc *p, int fd)
976 {
977 int error;
978 struct file *fp, *ofp;
979
980 error = suser(p->p_ucred, &p->p_acflag);
981 if (error)
982 return (error);
983 if (fd < 0) {
984 fp = 0;
985 } else {
986 error = getvnode(p, fd, &fp);
987 if (error)
988 return (error);
989 (void)fref(fp);
990 }
991 ofp = nfslockdfp;
992 nfslockdfp = fp;
993 if (ofp)
994 (void)frele(ofp);
995 nfslockdpid = nfslockdfp ? p->p_pid : 0;
996 (void)wakeup((void *)&nfslockdfp);
997 return (0);
998 }
999
1000 /*
1001 * nfslockdwait --
1002 * lock daemon waiting for lock request
1003 */
1004 int
1005 nfslockdwait(struct proc *p)
1006 {
1007 int error;
1008 struct file *fp, *ofp;
1009
1010 if (p->p_pid != nfslockdpid) {
1011 error = suser(p->p_ucred, &p->p_acflag);
1012 if (error)
1013 return (error);
1014 }
1015 if (nfslockdwaiting)
1016 return (EBUSY);
1017 if (nfslockdfifowritten) {
1018 nfslockdfifowritten = 0;
1019 return (0);
1020 }
1021
1022 nfslockdwaiting = 1;
1023 tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
1024 nfslockdwaiting = 0;
1025
1026 return (0);
1027 }