/*
 * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*-
 * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Berkeley Software Design Inc's name may not be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>		/* for hz */
#include <sys/file_internal.h>
#include <sys/malloc.h>
#include <sys/lockf.h>		/* Must come after sys/malloc.h */
#include <sys/kpi_mbuf.h>
#include <sys/mount_internal.h>
#include <sys/proc_internal.h>	/* for p_start */
#include <sys/kauth.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode_internal.h>

#include <kern/thread.h>

#include <machine/limits.h>

#include <net/if.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_lock.h>

#define OFF_MAX QUAD_MAX

/*
 * globals for managing the lockd fifo
 */
vnode_t nfslockdvnode = 0;
int nfslockdwaiting = 0;
time_t nfslockdstarttimeout = 0;
int nfslockdfifolock = 0;
#define NFSLOCKDFIFOLOCK_LOCKED	1
#define NFSLOCKDFIFOLOCK_WANT	2
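/*
 * Note: nfslockdfifolock builds a simple sleep lock out of an int.
 * A holder sets the LOCKED bit; a contender sets the WANT bit and
 * tsleep()s on the flag's address; on release the holder clears
 * LOCKED and, if WANT was set, clears it and wakeup()s the sleepers.
 * (See the fifo write loop in nfs_dolock() below.)
 */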

/*
 * Pending lock request messages are kept on this queue, which is
 * sorted by transaction ID (xid).
 */
uint64_t nfs_lockxid = 0;
LOCKD_MSG_QUEUE nfs_pendlockq;

/*
 * This structure is used to identify processes which have acquired NFS locks.
 * Knowing which processes have ever acquired locks allows us to short-circuit
 * unlock requests for processes that have never had an NFS file lock, thus
 * avoiding a costly and unnecessary lockd request.
 */
struct nfs_lock_pid {
	TAILQ_ENTRY(nfs_lock_pid) lp_lru;	/* LRU list */
	LIST_ENTRY(nfs_lock_pid) lp_hash;	/* hash chain */
	int lp_valid;				/* valid entry? */
	int lp_time;				/* last time seen valid */
	pid_t lp_pid;				/* The process ID. */
	struct timeval lp_pid_start;		/* Start time of process id */
};

#define NFS_LOCK_PID_HASH_SIZE	64	// XXX tune me
#define NFS_LOCK_PID_HASH(pid)	\
	(&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
u_long nfs_lock_pid_hash;
int nfs_lock_pid_lock;
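/*
 * Note: hashinit() (called from nfs_lockinit() below) allocates the
 * table and fills in nfs_lock_pid_hash with the mask that
 * NFS_LOCK_PID_HASH() applies.  nfs_lock_pid_lock guards both the
 * hash table and the LRU list; it is a tri-state flag: 0 = free,
 * 1 = held, -1 = held with waiters (see nfs_lock_pid_check()).
 */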


/*
 * initialize global nfs lock state
 */
void
nfs_lockinit(void)
{
	TAILQ_INIT(&nfs_pendlockq);
	nfs_lock_pid_lock = 0;
	nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
					 M_TEMP, &nfs_lock_pid_hash);
	TAILQ_INIT(&nfs_lock_pid_lru);
}

/*
 * insert a lock request message into the pending queue
 */
static inline void
nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
{
	LOCKD_MSG_REQUEST *mr;

	mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
	if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
		/* fast path: empty queue or new largest xid */
		TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
		return;
	}
	/* slow path: walk backwards through the list to find insertion point */
	while (mr && (msgreq->lmr_msg.lm_xid < mr->lmr_msg.lm_xid)) {
		mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
	}
	if (mr) {
		TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
	} else {
		TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
	}
}

/*
 * remove a lock request message from the pending queue
 */
static inline void
nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
{
	TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
}

/*
 * find a pending lock request message by xid
 *
 * We search from the head of the list assuming that the message we're
 * looking for is for an older request (because we have an answer to it).
 * This assumes that lock requests will be answered primarily in FIFO order.
 * However, this may not be the case if there are blocked requests.  We may
 * want to move blocked requests to a separate queue (but that'll complicate
 * duplicate xid checking).
 */
static inline LOCKD_MSG_REQUEST *
nfs_lockdmsg_find_by_xid(uint64_t lockxid)
{
	LOCKD_MSG_REQUEST *mr;

	TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
		if (mr->lmr_msg.lm_xid == lockxid)
			return mr;
		if (mr->lmr_msg.lm_xid > lockxid)
			return NULL;
	}
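	/* not found: TAILQ_FOREACH leaves mr NULL when it runs off the end */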
	return mr;
}

/*
 * Because we can't depend on nlm_granted messages containing the same
 * cookie we sent with the original lock request, we need code to test if
 * an nlm_granted answer matches the lock request.  We also need code
 * that can find a lockd message based solely on the nlm_granted answer.
 */

/*
 * compare lockd message to answer
 *
 * returns 0 on equality and 1 if different
 */
static inline int
nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
{
	if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
		return 1;
	if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
		return 1;
	if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
		return 1;
	if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
		return 1;
	if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
		return 1;
	if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
		return 1;
	return 0;
}

/*
 * find a pending lock request message based on the lock info provided
 * in the lockd_ans/nlm_granted data.  We need this because we can't
 * depend on nlm_granted messages containing the same cookie we sent
 * with the original lock request.
 *
 * We search from the head of the list assuming that the message we're
 * looking for is for an older request (because we have an answer to it).
 * This assumes that lock requests will be answered primarily in FIFO order.
 * However, this may not be the case if there are blocked requests.  We may
 * want to move blocked requests to a separate queue (but that'll complicate
 * duplicate xid checking).
 */
static inline LOCKD_MSG_REQUEST *
nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
{
	LOCKD_MSG_REQUEST *mr;

	if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
		return NULL;
	TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
		if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
			break;
	}
	return mr;
}

/*
 * return the next unique lock request transaction ID
 */
static inline uint64_t
nfs_lockxid_get(void)
{
	LOCKD_MSG_REQUEST *mr;

	/* derive initial lock xid from system time */
	if (!nfs_lockxid) {
		/*
		 * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
		 * due to a broken clock) because we immediately increment it
		 * and we guarantee to never use xid 0.  So, nfs_lockxid should only
		 * ever be 0 the first time this function is called.
		 */
		struct timeval tv;
		microtime(&tv);
		nfs_lockxid = (uint64_t)tv.tv_sec << 12;
	}
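		/*
		 * Note: the 12-bit shift gives each boot-second's starting
		 * point 4096 xids of headroom, presumably so that xids issued
		 * after a quick restart are unlikely to collide with xids
		 * issued before it.
		 */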
	}

	/* make sure we get a unique xid */
	do {
		/* Skip zero xid if it should ever happen. */
		if (++nfs_lockxid == 0)
			nfs_lockxid++;
		if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
		    (mr->lmr_msg.lm_xid < nfs_lockxid)) {
			/* fast path: empty queue or new largest xid */
			break;
		}
		/* check if xid is already in use */
	} while (nfs_lockdmsg_find_by_xid(nfs_lockxid));

	return nfs_lockxid;
}


/*
 * Check the nfs_lock_pid hash table for an entry and, if requested,
 * add the entry if it is not found.
 *
 * (Also, if adding, try to clean up some stale entries.)
 */
static int
nfs_lock_pid_check(proc_t p, int addflag, vnode_t vp)
{
	struct nfs_lock_pid *lp, *lplru, *lplru_next;
	proc_t plru;
	int error = 0;
	struct timeval now;

	/* lock hash */
loop:
	if (nfs_lock_pid_lock) {
		struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
		while (nfs_lock_pid_lock) {
			nfs_lock_pid_lock = -1;
			tsleep(&nfs_lock_pid_lock, PCATCH, "nfslockpid", 0);
			if ((error = nfs_sigintr(nmp, NULL, p)))
				return (error);
		}
		goto loop;
	}
	nfs_lock_pid_lock = 1;

	/* Search hash chain */
	error = ENOENT;
	lp = NFS_LOCK_PID_HASH(proc_pid(p))->lh_first;
	for (; lp != NULL; lp = lp->lp_hash.le_next)
		if (lp->lp_pid == proc_pid(p)) {
			/* found pid... */
			if (timevalcmp(&lp->lp_pid_start, &p->p_stats->p_start, ==)) {
				/* ...and it's valid */
				/* move to tail of LRU */
				TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
				microuptime(&now);
				lp->lp_time = now.tv_sec;
				TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
				error = 0;
				break;
			}
			/* ...but it's no longer valid */
			/* remove from hash, invalidate, and move to lru head */
			LIST_REMOVE(lp, lp_hash);
			lp->lp_valid = 0;
			TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
			TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
			lp = NULL;
			break;
		}

	/* if we didn't find it (valid) and we've been asked to add it */
	if ((error == ENOENT) && addflag) {
		/* scan lru list for invalid, stale entries to reuse/free */
		int lrucnt = 0;
		microuptime(&now);
		for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
			lplru_next = TAILQ_NEXT(lplru, lp_lru);
			if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
				/*
				 * If the oldest LRU entry is relatively new, then don't
				 * bother scanning any further.
				 */
				break;
			}
			/* remove entry from LRU, and check if it's still in use */
			TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
			if (!lplru->lp_valid || !(plru = pfind(lplru->lp_pid)) ||
			    timevalcmp(&lplru->lp_pid_start, &plru->p_stats->p_start, !=)) {
				/* no longer in use */
				LIST_REMOVE(lplru, lp_hash);
				if (!lp) {
					/* we'll reuse this one */
					lp = lplru;
				} else {
					/* we can free this one */
					FREE(lplru, M_TEMP);
				}
			} else {
				/* still in use */
				lplru->lp_time = now.tv_sec;
				TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
			}
			/* don't check too many entries at once */
			if (++lrucnt > 8)
				break;
		}
		if (!lp) {
			/* we need to allocate a new one */
			MALLOC(lp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
				M_TEMP, M_WAITOK | M_ZERO);
		}
		if (!lp) {
			error = ENOMEM;
		} else {
			/* (re)initialize nfs_lock_pid info */
			lp->lp_pid = proc_pid(p);
			lp->lp_pid_start = p->p_stats->p_start;
			/* insert pid in hash */
			LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
			lp->lp_valid = 1;
			lp->lp_time = now.tv_sec;
			TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
			error = 0;
		}
	}

	/* unlock hash */
	if (nfs_lock_pid_lock < 0) {
		nfs_lock_pid_lock = 0;
		wakeup(&nfs_lock_pid_lock);
	} else
		nfs_lock_pid_lock = 0;

	return (error);
}


/*
 * nfs_dolock --
 *	NFS advisory byte-level locks (VNOP_ADVLOCK).
 */
int
nfs_dolock(struct vnop_advlock_args *ap)
/* struct vnop_advlock_args {
	struct vnodeop_desc *a_desc;
	vnode_t a_vp;
	caddr_t a_id;
	int a_op;
	struct flock *a_fl;
	int a_flags;
	vfs_context_t a_context;
}; */
{
	LOCKD_MSG_REQUEST msgreq;
	LOCKD_MSG *msg;
	vnode_t vp, wvp;
	struct nfsnode *np;
	int error, error1;
	struct flock *fl;
	int fmode, ioflg;
	struct nfsmount *nmp;
	struct nfs_vattr nvattr;
	off_t start, end;
	struct timeval now;
	int timeo, endtime, lastmsg, wentdown = 0;
	int lockpidcheck;
	kauth_cred_t cred;
	proc_t p;
	struct sockaddr *saddr;

	p = vfs_context_proc(ap->a_context);
	cred = vfs_context_ucred(ap->a_context);

	vp = ap->a_vp;
	fl = ap->a_fl;
	np = VTONFS(vp);

	nmp = VFSTONFS(vnode_mount(vp));
	if (!nmp)
		return (ENXIO);
	if (nmp->nm_flag & NFSMNT_NOLOCKS)
		return (ENOTSUP);

	/*
	 * The NLM protocol doesn't allow the server to return an error
	 * on ranges, so we do that checking here.  Pre LFS (Large File Summit)
	 * standards required EINVAL for the range errors.  More recent
	 * standards use EOVERFLOW, but their EINVAL wording still
	 * encompasses these errors.
	 * Any code sensitive to this is either:
	 *	1) written pre-LFS and so can handle only EINVAL, or
	 *	2) written post-LFS and thus ought to be tolerant of pre-LFS
	 *	   implementations.
	 * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
	 */
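	/*
	 * For example: l_start == OFF_MAX with l_len == 2 would extend
	 * past OFF_MAX, so it draws EINVAL here.
	 */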
	if (fl->l_whence != SEEK_END) {
		if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
		    fl->l_start < 0 ||
		    (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
		    (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
			return (EINVAL);
	}
	/*
	 * If daemon is running take a ref on its fifo vnode
	 */
	if (!(wvp = nfslockdvnode)) {
		if (!nfslockdwaiting && !nfslockdstarttimeout)
			return (ENOTSUP);
		/*
		 * Don't wake lock daemon if it hasn't been started yet and
		 * this is an unlock request (since we couldn't possibly
		 * actually have a lock on the file).  This could be an
		 * uninformed unlock request due to closef()'s behavior of doing
		 * unlocks on all files if a process has had a lock on ANY file.
		 */
		if (!nfslockdvnode && (fl->l_type == F_UNLCK))
			return (EINVAL);
		microuptime(&now);
		if (nfslockdwaiting) {
			/* wake up lock daemon */
			nfslockdstarttimeout = now.tv_sec + 60;
			(void)wakeup((void *)&nfslockdwaiting);
		}
		/* wait on nfslockdvnode for a while to allow daemon to start */
		while (!nfslockdvnode && (now.tv_sec < nfslockdstarttimeout)) {
			error = tsleep((void *)&nfslockdvnode, PCATCH | PUSER, "lockdstart", 2*hz);
			if (error && (error != EWOULDBLOCK))
				return (error);
			/* check that we still have our mount... */
			/* ...and that we still support locks */
			nmp = VFSTONFS(vnode_mount(vp));
			if (!nmp)
				return (ENXIO);
			if (nmp->nm_flag & NFSMNT_NOLOCKS)
				return (ENOTSUP);
			if (!error)
				break;
			microuptime(&now);
		}
		/*
		 * Check for nfslockdvnode one more time.
		 * If the daemon hasn't started by now, there's a problem.
		 */
		if (!(wvp = nfslockdvnode))
			return (ENOTSUP);
	}
	error = vnode_getwithref(wvp);
	if (error)
		return (ENOTSUP);
	error = vnode_ref(wvp);
	if (error) {
		vnode_put(wvp);
		return (ENOTSUP);
	}

	/*
	 * Need to check if this process has successfully acquired an NFS lock before.
	 * If not, and this is an unlock request, we can simply return success here.
	 */
	lockpidcheck = nfs_lock_pid_check(p, 0, vp);
	if (lockpidcheck) {
		if (lockpidcheck != ENOENT) {
			vnode_rele(wvp);
			vnode_put(wvp);
			return (lockpidcheck);
		}
		if (ap->a_op == F_UNLCK) {
			vnode_rele(wvp);
			vnode_put(wvp);
			return (0);
		}
	}

	/*
	 * The NFS Lock Manager protocol doesn't directly handle
	 * negative lengths or SEEK_END, so we need to normalize
	 * things here where we have all the info.
	 * (Note: SEEK_CUR is already adjusted for at this point)
	 */
	/* Convert the flock structure into a start and end. */
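	/*
	 * For example: l_whence == SEEK_SET, l_start == 100, l_len == -10
	 * locks bytes 90..99 (start = 90, end = 99), while l_len == 0
	 * locks from l_start to the end of file (end = -1).
	 */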
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * to fl->l_start when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;
	case SEEK_END:
		/*
		 * Need to flush, and refetch attributes to make
		 * sure we have the correct end of file offset.
		 */
		if (np->n_flag & NMODIFIED) {
			NATTRINVALIDATE(np);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error) {
				vnode_rele(wvp);
				vnode_put(wvp);
				return (error);
			}
		}
		NATTRINVALIDATE(np);

		error = nfs_getattr(vp, &nvattr, cred, p);
		if (error) {
			vnode_rele(wvp);
			vnode_put(wvp);
			return (error);
		}
		start = np->n_size + fl->l_start;
		break;
	default:
		vnode_rele(wvp);
		vnode_put(wvp);
		return (EINVAL);
	}
	if (fl->l_len == 0)
		end = -1;
	else if (fl->l_len > 0)
		end = start + fl->l_len - 1;
	else { /* l_len is negative */
		end = start - 1;
		start += fl->l_len;
	}
	if (start < 0) {
		vnode_rele(wvp);
		vnode_put(wvp);
		return (EINVAL);
	}
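	/*
	 * NFSv2 is served by NLM versions 1-3, which carry only 32-bit
	 * offsets, so reject offsets with the sign bit set (>= 2^31),
	 * presumably to avoid sign-extension trouble as well.
	 */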
	if (!NFS_ISV3(vp) &&
	    ((start >= 0x80000000) || (end >= 0x80000000))) {
		vnode_rele(wvp);
		vnode_put(wvp);
		return (EINVAL);
	}

	/*
	 * Fill in the information structure.
	 */
	msgreq.lmr_answered = 0;
	msgreq.lmr_errno = 0;
	msgreq.lmr_saved_errno = 0;
	msg = &msgreq.lmr_msg;
	msg->lm_version = LOCKD_MSG_VERSION;
	msg->lm_flags = 0;

	msg->lm_fl = *fl;
	msg->lm_fl.l_start = start;
	if (end != -1)
		msg->lm_fl.l_len = end - start + 1;
	msg->lm_fl.l_pid = proc_pid(p);

	if (ap->a_flags & F_WAIT)
		msg->lm_flags |= LOCKD_MSG_BLOCK;
	if (ap->a_op == F_GETLK)
		msg->lm_flags |= LOCKD_MSG_TEST;

	nmp = VFSTONFS(vnode_mount(vp));
	if (!nmp) {
		vnode_rele(wvp);
		vnode_put(wvp);
		return (ENXIO);
	}

	saddr = mbuf_data(nmp->nm_nam);
	bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len));
	msg->lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
	bcopy(VTONFS(vp)->n_fhp, msg->lm_fh, msg->lm_fh_len);
	if (NFS_ISV3(vp))
		msg->lm_flags |= LOCKD_MSG_NFSV3;
	cru2x(cred, &msg->lm_cred);

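	/*
	 * Prime lastmsg so that the first "lockd not responding" complaint
	 * waits nm_tprintf_initial_delay seconds rather than the full
	 * nm_tprintf_delay repeat interval.
	 */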
	microuptime(&now);
	lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));

	fmode = FFLAGS(O_WRONLY);
	if ((error = VNOP_OPEN(wvp, fmode, ap->a_context))) {
		vnode_rele(wvp);
		vnode_put(wvp);
		return (error);
	}
	vnode_lock(wvp);
	++wvp->v_writecount;
	vnode_unlock(wvp);

	/* allocate unique xid */
	msg->lm_xid = nfs_lockxid_get();
	nfs_lockdmsg_enqueue(&msgreq);

	timeo = 2*hz;
#define IO_NOMACCHECK 0
	ioflg = IO_UNIT | IO_NOMACCHECK;
	for (;;) {
		error = 0;
		while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
			nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
			error = tsleep((void *)&nfslockdfifolock,
					PCATCH | PUSER, "lockdfifo", 20*hz);
			if (error)
				break;
		}
		if (error)
			break;
		nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;

		error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)msg, sizeof(*msg), 0,
				UIO_SYSSPACE32, ioflg, proc_ucred(kernproc), NULL, p);

		nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
		if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
			nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
			wakeup((void *)&nfslockdfifolock);
		}

		if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
			break;
		}

		/*
		 * Always wait for an answer.  Not waiting for unlocks could
		 * cause a lock to be left if the unlock request gets dropped.
		 */

		/*
		 * Retry if it takes too long to get a response.
		 *
		 * The timeout numbers were picked out of thin air... they start
		 * at 2 and double each timeout with a max of 60 seconds.
		 *
		 * In order to maintain responsiveness, we pass a small timeout
		 * to tsleep and calculate the timeouts ourselves.  This allows
		 * us to pick up on mount changes quicker.
		 */
wait_for_granted:
		error = EWOULDBLOCK;
		microuptime(&now);
		if ((timeo/hz) > 0)
			endtime = now.tv_sec + timeo/hz;
		else
			endtime = now.tv_sec + 1;
		while (now.tv_sec < endtime) {
			error = tsleep((void *)&msgreq, PCATCH | PUSER, "lockd", 2*hz);
			if (msgreq.lmr_answered) {
				/*
				 * Note: it's possible to have a lock granted at
				 * essentially the same time that we get interrupted.
				 * Since the lock may be granted, we can't return an
				 * error from this request or we might not unlock the
				 * lock that's been granted.
				 */
				error = 0;
				break;
			}
			if (error != EWOULDBLOCK)
				break;
			/* check that we still have our mount... */
			/* ...and that we still support locks */
			nmp = VFSTONFS(vnode_mount(vp));
			if (!nmp || (nmp->nm_flag & NFSMNT_NOLOCKS))
				break;
			/*
			 * If the mount is hung and we've requested not to hang
			 * on remote filesystems, then bail now.
			 */
			if ((p != NULL) && ((proc_noremotehang(p)) != 0) &&
			    ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
				if (fl->l_type == F_UNLCK)
					printf("nfs_dolock: aborting unlock request "
					    "due to timeout (noremotehang)\n");
				error = EIO;
				break;
			}
			microuptime(&now);
		}
		if (error) {
			/* check that we still have our mount... */
			nmp = VFSTONFS(vnode_mount(vp));
			if (!nmp) {
				if (error == EWOULDBLOCK)
					error = ENXIO;
				break;
			}
			/* ...and that we still support locks */
			if (nmp->nm_flag & NFSMNT_NOLOCKS) {
				if (error == EWOULDBLOCK)
					error = ENOTSUP;
				break;
			}
			if ((error == ENOTSUP) &&
			    (nmp->nm_state & NFSSTA_LOCKSWORK)) {
				/*
				 * We have evidence that locks work, yet lockd
				 * returned ENOTSUP.  This is probably because
				 * it was unable to contact the server's lockd to
				 * send it the request.
				 *
				 * Because we know locks work, we'll consider
				 * this failure to be a timeout.
				 */
				error = EWOULDBLOCK;
			}
			if (error != EWOULDBLOCK) {
				/*
				 * We're going to bail on this request.
				 * If we were a blocked lock request, send a cancel.
				 */
				if ((msgreq.lmr_errno == EINPROGRESS) &&
				    !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
					/* set this request up as a cancel */
					msg->lm_flags |= LOCKD_MSG_CANCEL;
					nfs_lockdmsg_dequeue(&msgreq);
					msg->lm_xid = nfs_lockxid_get();
					nfs_lockdmsg_enqueue(&msgreq);
					msgreq.lmr_saved_errno = error;
					msgreq.lmr_errno = 0;
					msgreq.lmr_answered = 0;
					/* reset timeout */
					timeo = 2*hz;
					/* send cancel request */
					continue;
				}
				break;
			}

			/*
			 * If the mount is hung and we've requested not to hang
			 * on remote filesystems, then bail now.
			 */
			if ((p != NULL) && ((proc_noremotehang(p)) != 0) &&
			    ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
				if (fl->l_type == F_UNLCK)
					printf("nfs_dolock: aborting unlock request "
					    "due to timeout (noremotehang)\n");
				error = EIO;
				break;
			}
			/* warn if we're not getting any response */
			microuptime(&now);
			if ((msgreq.lmr_errno != EINPROGRESS) &&
			    (nmp->nm_tprintf_initial_delay != 0) &&
			    ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
				lastmsg = now.tv_sec;
				nfs_down(nmp, p, 0, NFSSTA_LOCKTIMEO, "lockd not responding");
				wentdown = 1;
			}
			if (msgreq.lmr_errno == EINPROGRESS) {
				/*
				 * We've got a blocked lock request that we are
				 * going to retry.  First, we'll want to try to
				 * send a cancel for the previous request.
				 *
				 * Clear errno so if we don't get a response
				 * to the resend we'll call nfs_down().
				 * Also reset timeout because we'll expect a
				 * quick response to the cancel/resend (even if
				 * it is NLM_BLOCKED).
				 */
				msg->lm_flags |= LOCKD_MSG_CANCEL;
				nfs_lockdmsg_dequeue(&msgreq);
				msg->lm_xid = nfs_lockxid_get();
				nfs_lockdmsg_enqueue(&msgreq);
				msgreq.lmr_saved_errno = msgreq.lmr_errno;
				msgreq.lmr_errno = 0;
				msgreq.lmr_answered = 0;
				timeo = 2*hz;
				/* send cancel then resend request */
				continue;
			}
			/*
			 * We timed out, so we will rewrite the request
			 * to the fifo, but only if it isn't already full.
			 */
			ioflg |= IO_NDELAY;
			timeo *= 2;
			if (timeo > 60*hz)
				timeo = 60*hz;
			/* resend request */
			continue;
		}

		/* we got a response, so the server's lockd is OK */
		nfs_up(VFSTONFS(vnode_mount(vp)), p, NFSSTA_LOCKTIMEO,
			wentdown ? "lockd alive again" : NULL);
		wentdown = 0;

		if (msgreq.lmr_errno == EINPROGRESS) {
			/* got NLM_BLOCKED response */
			/* need to wait for NLM_GRANTED */
			timeo = 60*hz;
			msgreq.lmr_answered = 0;
			goto wait_for_granted;
		}

		if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
		    (msgreq.lmr_saved_errno == EINPROGRESS)) {
			/*
			 * We just got a successful reply to the
			 * cancel of the previous blocked lock request.
			 * Now, go ahead and resend the request.
			 */
			msg->lm_flags &= ~LOCKD_MSG_CANCEL;
			nfs_lockdmsg_dequeue(&msgreq);
			msg->lm_xid = nfs_lockxid_get();
			nfs_lockdmsg_enqueue(&msgreq);
			msgreq.lmr_saved_errno = 0;
			msgreq.lmr_errno = 0;
			msgreq.lmr_answered = 0;
			timeo = 2*hz;
			/* resend request */
			continue;
		}

		if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
			if (msg->lm_fl.l_type != F_UNLCK) {
				fl->l_type = msg->lm_fl.l_type;
				fl->l_pid = msg->lm_fl.l_pid;
				fl->l_start = msg->lm_fl.l_start;
				fl->l_len = msg->lm_fl.l_len;
				fl->l_whence = SEEK_SET;
			} else {
				fl->l_type = F_UNLCK;
			}
		}

		/*
		 * If the blocked lock request was cancelled,
		 * restore the error condition from when we
		 * originally bailed on the request.
		 */
		if (msg->lm_flags & LOCKD_MSG_CANCEL) {
			msg->lm_flags &= ~LOCKD_MSG_CANCEL;
			error = msgreq.lmr_saved_errno;
		} else
			error = msgreq.lmr_errno;

		if (!error) {
			/* record that NFS file locking has worked on this mount */
			nmp = VFSTONFS(vnode_mount(vp));
			if (nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK))
				nmp->nm_state |= NFSSTA_LOCKSWORK;
			/*
			 * If we successfully acquired a lock, make sure this pid
			 * is in the nfs_lock_pid hash table so we know we can't
			 * short-circuit unlock requests.
			 */
			if ((lockpidcheck == ENOENT) &&
			    ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW)))
				nfs_lock_pid_check(p, 1, vp);
		}
		break;
	}

	nfs_lockdmsg_dequeue(&msgreq);

	error1 = VNOP_CLOSE(wvp, FWRITE, ap->a_context);
	vnode_rele(wvp);
	vnode_put(wvp);
	/* prefer any previous 'error' to our VNOP_CLOSE 'error1'. */
	return (error != 0 ? error : error1);
}

/*
 * nfslockdans --
 *	NFS advisory byte-level locks answer from the lock daemon.
 */
int
nfslockdans(proc_t p, struct lockd_ans *ansp)
{
	LOCKD_MSG_REQUEST *msgreq;
	int error;

	/* Let root make this call. */
	error = proc_suser(p);
	if (error)
		return (error);

	/* the version should match, or we're out of sync */
	if (ansp->la_version != LOCKD_ANS_VERSION)
		return (EINVAL);

	/* try to find the lockd message by transaction id (cookie) */
	msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
	if (ansp->la_flags & LOCKD_ANS_GRANTED) {
		/*
		 * We can't depend on the granted message having our cookie,
		 * so we check the answer against the lockd message found.
		 * If no message was found or it doesn't match the answer,
		 * we look for the lockd message by the answer's lock info.
		 */
		if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
			msgreq = nfs_lockdmsg_find_by_answer(ansp);
		/*
		 * We need to make sure this request isn't being cancelled.
		 * If it is, we don't want to accept the granted message.
		 */
		if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
			msgreq = NULL;
	}
	if (!msgreq)
		return (EPIPE);

	msgreq->lmr_errno = ansp->la_errno;
	if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
		if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
			if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
				msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
			else
				msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
			msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
			msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
			msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
		} else {
			msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
		}
	}

	msgreq->lmr_answered = 1;
	(void)wakeup((void *)msgreq);

	return (0);
}

/*
 * nfslockdfd --
 *	NFS advisory byte-level locks: fifo file# from the lock daemon.
 */
int
nfslockdfd(proc_t p, int fd)
{
	int error;
	vnode_t vp, oldvp;

	error = proc_suser(p);
	if (error)
		return (error);
	if (fd < 0) {
		vp = NULL;
	} else {
		error = file_vnode(fd, &vp);
		if (error)
			return (error);
		error = vnode_getwithref(vp);
		if (error)
			return (error);
		error = vnode_ref(vp);
		if (error) {
			vnode_put(vp);
			return (error);
		}
	}
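	/*
	 * Swap in the new fifo vnode.  The long-term reference taken
	 * above with vnode_ref() is held for as long as the fifo is
	 * registered; the short-term iocount is dropped below, and the
	 * previously registered fifo's reference is released.
	 */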
	oldvp = nfslockdvnode;
	nfslockdvnode = vp;
	if (oldvp) {
		vnode_rele(oldvp);
	}
	(void)wakeup((void *)&nfslockdvnode);
	if (vp) {
		vnode_put(vp);
	}
	return (0);
}

/*
 * nfslockdwait --
 *	lock daemon waiting for lock request
 */
int
nfslockdwait(proc_t p)
{
	int error;

	error = proc_suser(p);
	if (error)
		return (error);
	if (nfslockdwaiting || nfslockdvnode)
		return (EBUSY);

	nfslockdstarttimeout = 0;
	nfslockdwaiting = 1;
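	/*
	 * Sleep here until nfs_dolock() has a request to service and
	 * wakes us with wakeup(&nfslockdwaiting).
	 */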
	tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
	nfslockdwaiting = 0;

	return (0);
}