bsd/nfs/nfs_lock.c

   1 /*
   2  * Copyright (c) 2002-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30 /*-
  31  * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  42  *    promote products derived from this software without specific prior
  43  *    written permission.
  44  *
  45  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  48  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  55  * SUCH DAMAGE.
  56  *
  57  *      from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
  58  */
  59
  60 #include <sys/cdefs.h>
  61 #include <sys/param.h>
  62 #include <sys/systm.h>
  63 #include <sys/fcntl.h>
  64 #include <sys/kernel.h>         /* for hz */
  65 #include <sys/file_internal.h>
  66 #include <sys/malloc.h>
  67 #include <sys/lockf.h>          /* for hz */ /* Must come after sys/malloc.h */
  68 #include <sys/kpi_mbuf.h>
  69 #include <sys/mount_internal.h>
  70 #include <sys/proc_internal.h>  /* for p_start */
  71 #include <sys/kauth.h>
  72 #include <sys/resourcevar.h>
  73 #include <sys/socket.h>
  74 #include <sys/unistd.h>
  75 #include <sys/user.h>
  76 #include <sys/vnode_internal.h>
  77
  78 #include <kern/thread.h>
  79
  80 #include <machine/limits.h>
  81
  82 #include <net/if.h>
  83
  84 #include <nfs/rpcv2.h>
  85 #include <nfs/nfsproto.h>
  86 #include <nfs/nfs.h>
  87 #include <nfs/nfsmount.h>
  88 #include <nfs/nfsnode.h>
  89 #include <nfs/nfs_lock.h>
  90
  91 #define OFF_MAX QUAD_MAX
  92
  93 /*
  94  * globals for managing the lockd fifo
  95  */
  96 vnode_t nfslockdvnode = 0;
  97 int nfslockdwaiting = 0;
  98 time_t nfslockdstarttimeout = 0;
  99 int nfslockdfifolock = 0;
 100 #define NFSLOCKDFIFOLOCK_LOCKED 1
 101 #define NFSLOCKDFIFOLOCK_WANT   2
 102
 103 /*
 104  * pending lock request messages are kept in this queue which is
 105  * kept sorted by transaction ID (xid).
 106  */
 107 uint64_t nfs_lockxid = 0;
 108 LOCKD_MSG_QUEUE nfs_pendlockq;
 109
 110 /*
 111  * This structure is used to identify processes which have acquired NFS locks.
 112  * Knowing which processes have ever acquired locks allows us to short-circuit
 113  * unlock requests for processes that have never had an NFS file lock.  Thus
 114  * avoiding a costly and unnecessary lockd request.
 115  */
 116 struct nfs_lock_pid {
 117         TAILQ_ENTRY(nfs_lock_pid)       lp_lru;         /* LRU list */
 118         LIST_ENTRY(nfs_lock_pid)        lp_hash;        /* hash chain */
 119         int                             lp_valid;       /* valid entry? */
 120         int                             lp_time;        /* last time seen valid */
 121         pid_t                           lp_pid;         /* The process ID. */
 122         struct timeval                  lp_pid_start;   /* Start time of process id */
 123 };
 124
 125 #define NFS_LOCK_PID_HASH_SIZE          64      // XXX tune me
 126 #define NFS_LOCK_PID_HASH(pid)  \
 127         (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
 128 LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
 129 TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
 130 u_long nfs_lock_pid_hash;
 131 int nfs_lock_pid_lock;
 132
 133
 134 /*
 135  * initialize global nfs lock state
 136  */
 137 void
 138 nfs_lockinit(void)
 139 {
 140         TAILQ_INIT(&nfs_pendlockq);
 141         nfs_lock_pid_lock = 0;
 142         nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
 143                                          M_TEMP, &nfs_lock_pid_hash);
 144         TAILQ_INIT(&nfs_lock_pid_lru);
 145 }
 146
 147 /*
 148  * insert a lock request message into the pending queue
 149  */
 150 static inline void
 151 nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
 152 {
 153         LOCKD_MSG_REQUEST *mr;
 154
 155         mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
 156         if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
 157                 /* fast path: empty queue or new largest xid */
 158                 TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
 159                 return;
 160         }
 161         /* slow path: need to walk list to find insertion point */
 162         while (mr && (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
 163                 mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
 164         }
 165         if (mr) {
 166                 TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
 167         } else {
 168                 TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
 169         }
 170 }
 171
 172 /*
 173  * remove a lock request message from the pending queue
 174  */
 175 static inline void
 176 nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
 177 {
 178         TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
 179 }
 180
 181 /*
 182  * find a pending lock request message by xid
 183  *
 184  * We search from the head of the list assuming that the message we're
 185  * looking for is for an older request (because we have an answer to it).
 186  * This assumes that lock request will be answered primarily in FIFO order.
 187  * However, this may not be the case if there are blocked requests.  We may
 188  * want to move blocked requests to a separate queue (but that'll complicate
 189  * duplicate xid checking).
 190  */
 191 static inline LOCKD_MSG_REQUEST *
 192 nfs_lockdmsg_find_by_xid(uint64_t lockxid)
 193 {
 194         LOCKD_MSG_REQUEST *mr;
 195
 196         TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
 197                 if (mr->lmr_msg.lm_xid == lockxid)
 198                         return mr;
 199                 if (mr->lmr_msg.lm_xid > lockxid)
 200                         return NULL;
 201         }
 202         return mr;
 203 }
 204
 205 /*
 * Because we can't depend on nlm_granted messages containing the same
 * cookie we sent with the original lock request, we need code to test
 * whether an nlm_granted answer matches a lock request.  We also need
 * code that can find a lockd message based solely on the nlm_granted answer.
 210  */
 211
 212 /*
 213  * compare lockd message to answer
 214  *
 215  * returns 0 on equality and 1 if different
 216  */
 217 static inline int
 218 nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
 219 {
 220         if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
 221                 return 1;
 222         if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
 223                 return 1;
 224         if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
 225                 return 1;
 226         if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
 227                 return 1;
 228         if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
 229                 return 1;
 230         if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
 231                 return 1;
 232         return 0;
 233 }
 234
 235 /*
 236  * find a pending lock request message based on the lock info provided
 237  * in the lockd_ans/nlm_granted data.  We need this because we can't
 238  * depend on nlm_granted messages containing the same cookie we sent
 239  * with the original lock request.
 240  *
 241  * We search from the head of the list assuming that the message we're
 242  * looking for is for an older request (because we have an answer to it).
 243  * This assumes that lock request will be answered primarily in FIFO order.
 244  * However, this may not be the case if there are blocked requests.  We may
 245  * want to move blocked requests to a separate queue (but that'll complicate
 246  * duplicate xid checking).
 247  */
 248 static inline LOCKD_MSG_REQUEST *
 249 nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
 250 {
 251         LOCKD_MSG_REQUEST *mr;
 252
 253         if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
 254                 return NULL;
 255         TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
 256                 if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
 257                         break;
 258         }
 259         return mr;
 260 }
 261
 262 /*
 263  * return the next unique lock request transaction ID
 264  */
 265 static inline uint64_t
 266 nfs_lockxid_get(void)
 267 {
 268         LOCKD_MSG_REQUEST *mr;
 269
 270         /* derive initial lock xid from system time */
 271         if (!nfs_lockxid) {
 272                 /*
 273                  * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
 274                  * due to a broken clock) because we immediately increment it
 275                  * and we guarantee to never use xid 0.  So, nfs_lockxid should only
 276                  * ever be 0 the first time this function is called.
 277                  */
 278                 struct timeval tv;
 279                 microtime(&tv);
 280                 nfs_lockxid = (uint64_t)tv.tv_sec << 12;
 281         }
 282
 283         /* make sure we get a unique xid */
 284         do {
 285                 /* Skip zero xid if it should ever happen.  */
 286                 if (++nfs_lockxid == 0)
 287                         nfs_lockxid++;
 288                 if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
 289                      (mr->lmr_msg.lm_xid < nfs_lockxid)) {
 290                         /* fast path: empty queue or new largest xid */
 291                         break;
 292                 }
 293                 /* check if xid is already in use */
 294         } while (nfs_lockdmsg_find_by_xid(nfs_lockxid));
 295
 296         return nfs_lockxid;
 297 }
 298
 299
 300 /*
 301  * Check the nfs_lock_pid hash table for an entry and, if requested,
 302  * add the entry if it is not found.
 303  *
 304  * (Also, if adding, try to clean up some stale entries.)
 305  */
 306 static int
 307 nfs_lock_pid_check(proc_t p, int addflag, vnode_t vp)
 308 {
 309         struct nfs_lock_pid *lp, *lplru, *lplru_next;
 310         proc_t plru;
 311         int error = 0;
 312         struct timeval now;
 313
 314         /* lock hash */
 315 loop:
 316         if (nfs_lock_pid_lock) {
 317                 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
 318                 while (nfs_lock_pid_lock) {
 319                         nfs_lock_pid_lock = -1;
 320                         tsleep(&nfs_lock_pid_lock, PCATCH, "nfslockpid", 0);
 321                         if ((error = nfs_sigintr(nmp, NULL, p)))
 322                                 return (error);
 323                 }
 324                 goto loop;
 325         }
 326         nfs_lock_pid_lock = 1;
 327
 328         /* Search hash chain */
 329         error = ENOENT;
 330         lp = NFS_LOCK_PID_HASH(proc_pid(p))->lh_first;
 331         for (; lp != NULL; lp = lp->lp_hash.le_next)
 332                 if (lp->lp_pid == proc_pid(p)) {
 333                         /* found pid... */
 334                         if (timevalcmp(&lp->lp_pid_start, &p->p_stats->p_start, ==)) {
 335                                 /* ...and it's valid */
 336                                 /* move to tail of LRU */
 337                                 TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
 338                                 microuptime(&now);
 339                                 lp->lp_time = now.tv_sec;
 340                                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
 341                                 error = 0;
 342                                 break;
 343                         }
 344                         /* ...but it's no longer valid */
 345                         /* remove from hash, invalidate, and move to lru head */
 346                         LIST_REMOVE(lp, lp_hash);
 347                         lp->lp_valid = 0;
 348                         TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
 349                         TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
 350                         lp = NULL;
 351                         break;
 352                 }
 353
 354         /* if we didn't find it (valid) and we've been asked to add it */
 355         if ((error == ENOENT) && addflag) {
 356                 /* scan lru list for invalid, stale entries to reuse/free */
 357                 int lrucnt = 0;
 358                 microuptime(&now);
 359                 for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
 360                         lplru_next = TAILQ_NEXT(lplru, lp_lru);
 361                         if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
 362                                 /*
 363                                  * If the oldest LRU entry is relatively new, then don't
 364                                  * bother scanning any further.
 365                                  */
 366                                 break;
 367                         }
 368                         /* remove entry from LRU, and check if it's still in use */
 369                         TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
 370                         if (!lplru->lp_valid || !(plru = pfind(lplru->lp_pid)) ||
 371                             timevalcmp(&lplru->lp_pid_start, &plru->p_stats->p_start, !=)) {
 372                                 /* no longer in use */
 373                                 LIST_REMOVE(lplru, lp_hash);
 374                                 if (!lp) {
 375                                         /* we'll reuse this one */
 376                                         lp = lplru;
 377                                 } else {
 378                                         /* we can free this one */
 379                                         FREE(lplru, M_TEMP);
 380                                 }
 381                         } else {
 382                                 /* still in use */
 383                                 lplru->lp_time = now.tv_sec;
 384                                 TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
 385                         }
 386                         /* don't check too many entries at once */
 387                         if (++lrucnt > 8)
 388                                 break;
 389                 }
 390                 if (!lp) {
 391                         /* we need to allocate a new one */
 392                         MALLOC(lp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
 393                                 M_TEMP, M_WAITOK | M_ZERO);
 394                 }
 395                 if (!lp) {
 396                         error = ENOMEM;
 397                 } else {
 398                         /* (re)initialize nfs_lock_pid info */
 399                         lp->lp_pid = proc_pid(p);
 400                         lp->lp_pid_start = p->p_stats->p_start;
 401                         /* insert pid in hash */
 402                         LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
 403                         lp->lp_valid = 1;
 404                         lp->lp_time = now.tv_sec;
 405                         TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
 406                         error = 0;
 407                 }
 408         }
 409
 410         /* unlock hash */
 411         if (nfs_lock_pid_lock < 0) {
 412                 nfs_lock_pid_lock = 0;
 413                 wakeup(&nfs_lock_pid_lock);
 414         } else
 415                 nfs_lock_pid_lock = 0;
 416
 417         return (error);
 418 }
 419
 420
 421 /*
 * nfs_dolock --
 *      NFS advisory byte-level locks.
 424  */
 425 int
 426 nfs_dolock(struct vnop_advlock_args *ap)
 427 /* struct vnop_advlock_args {
 428         struct vnodeop_desc *a_desc;
 429         vnode_t a_vp;
 430         caddr_t a_id;
 431         int a_op;
 432         struct flock *a_fl;
 433         int a_flags;
 434         vfs_context_t a_context;
 435 }; */
 436 {
 437         LOCKD_MSG_REQUEST msgreq;
 438         LOCKD_MSG *msg;
 439         vnode_t vp, wvp;
 440         struct nfsnode *np;
 441         int error, error1;
 442         struct flock *fl;
 443         int fmode, ioflg;
 444         struct nfsmount *nmp;
 445         struct nfs_vattr nvattr;
 446         off_t start, end;
 447         struct timeval now;
 448         int timeo, endtime, lastmsg, wentdown = 0;
 449         int lockpidcheck;
 450         kauth_cred_t cred;
 451         proc_t p;
 452         struct sockaddr *saddr;
 453
 454         p = vfs_context_proc(ap->a_context);
 455         cred = vfs_context_ucred(ap->a_context);
 456
 457         vp = ap->a_vp;
 458         fl = ap->a_fl;
 459         np = VTONFS(vp);
 460
 461         nmp = VFSTONFS(vnode_mount(vp));
 462         if (!nmp)
 463                 return (ENXIO);
 464         if (nmp->nm_flag & NFSMNT_NOLOCKS)
 465                 return (ENOTSUP);
 466
 467         /*
 468          * The NLM protocol doesn't allow the server to return an error
 469          * on ranges, so we do it.  Pre LFS (Large File Summit)
 470          * standards required EINVAL for the range errors.  More recent
 471          * standards use EOVERFLOW, but their EINVAL wording still
 472          * encompasses these errors.
 473          * Any code sensitive to this is either:
 474          *  1) written pre-LFS and so can handle only EINVAL, or
 475          *  2) written post-LFS and thus ought to be tolerant of pre-LFS
 476          *     implementations.
 477          * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
 478          */
 479         if (fl->l_whence != SEEK_END) {
 480                 if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
 481                     fl->l_start < 0 ||
 482                     (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
 483                     (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
 484                         return (EINVAL);
 485         }
 486         /*
 487          * If daemon is running take a ref on its fifo vnode
 488          */
 489         if (!(wvp = nfslockdvnode)) {
 490                 if (!nfslockdwaiting && !nfslockdstarttimeout)
 491                         return (ENOTSUP);
 492                 /*
 493                  * Don't wake lock daemon if it hasn't been started yet and
 494                  * this is an unlock request (since we couldn't possibly
 495                  * actually have a lock on the file).  This could be an
 496                  * uninformed unlock request due to closef()'s behavior of doing
 497                  * unlocks on all files if a process has had a lock on ANY file.
 498                  */
 499                 if (!nfslockdvnode && (fl->l_type == F_UNLCK))
 500                         return (EINVAL);
 501                 microuptime(&now);
 502                 if (nfslockdwaiting) {
 503                         /* wake up lock daemon */
 504                         nfslockdstarttimeout = now.tv_sec + 60;
 505                         (void)wakeup((void *)&nfslockdwaiting);
 506                 }
 507                 /* wait on nfslockdvnode for a while to allow daemon to start */
 508                 while (!nfslockdvnode && (now.tv_sec < nfslockdstarttimeout)) {
 509                         error = tsleep((void *)&nfslockdvnode, PCATCH | PUSER, "lockdstart", 2*hz);
 510                         if (error && (error != EWOULDBLOCK))
 511                                 return (error);
 512                         /* check that we still have our mount... */
 513                         /* ...and that we still support locks */
 514                         nmp = VFSTONFS(vnode_mount(vp));
 515                         if (!nmp)
 516                                 return (ENXIO);
 517                         if (nmp->nm_flag & NFSMNT_NOLOCKS)
 518                                 return (ENOTSUP);
 519                         if (!error)
 520                                 break;
 521                         microuptime(&now);
 522                 }
 523                 /*
 524                  * check for nfslockdvnode
 525                  * If it hasn't started by now, there's a problem.
 526                  */
 527                 if (!(wvp = nfslockdvnode))
 528                         return (ENOTSUP);
 529         }
 530         error = vnode_getwithref(wvp);
 531         if (error)
 532                 return (ENOTSUP);
 533         error = vnode_ref(wvp);
 534         if (error) {
 535                 vnode_put(wvp);
 536                 return (ENOTSUP);
 537         }
 538
 539         /*
 540          * Need to check if this process has successfully acquired an NFS lock before.
 541          * If not, and this is an unlock request we can simply return success here.
 542          */
 543         lockpidcheck = nfs_lock_pid_check(p, 0, vp);
 544         if (lockpidcheck) {
 545                 if (lockpidcheck != ENOENT) {
 546                         vnode_rele(wvp);
 547                         vnode_put(wvp);
 548                         return (lockpidcheck);
 549                 }
 550                 if (ap->a_op == F_UNLCK) {
 551                         vnode_rele(wvp);
 552                         vnode_put(wvp);
 553                         return (0);
 554                 }
 555         }
 556
 557         /*
 558          * The NFS Lock Manager protocol doesn't directly handle
 559          * negative lengths or SEEK_END, so we need to normalize
 560          * things here where we have all the info.
 561          * (Note: SEEK_CUR is already adjusted for at this point)
 562          */
 563         /* Convert the flock structure into a start and end. */
 564         switch (fl->l_whence) {
 565         case SEEK_SET:
 566         case SEEK_CUR:
 567                 /*
 568                  * Caller is responsible for adding any necessary offset
 569                  * to fl->l_start when SEEK_CUR is used.
 570                  */
 571                 start = fl->l_start;
 572                 break;
 573         case SEEK_END:
 574                 /* need to flush, and refetch attributes to make */
 575                 /* sure we have the correct end of file offset   */
 576                 if (np->n_flag & NMODIFIED) {
 577                         NATTRINVALIDATE(np);
 578                         error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 579                         if (error) {
 580                                 vnode_rele(wvp);
 581                                 vnode_put(wvp);
 582                                 return (error);
 583                         }
 584                 }
 585                 NATTRINVALIDATE(np);
 586
 587                 error = nfs_getattr(vp, &nvattr, cred, p);
 588                 if (error) {
 589                         vnode_rele(wvp);
 590                         vnode_put(wvp);
 591                         return (error);
 592                 }
 593                 start = np->n_size + fl->l_start;
 594                 break;
 595         default:
 596                 vnode_rele(wvp);
 597                 vnode_put(wvp);
 598                 return (EINVAL);
 599         }
 600         if (fl->l_len == 0)
 601                 end = -1;
 602         else if (fl->l_len > 0)
 603                 end = start + fl->l_len - 1;
 604         else { /* l_len is negative */
 605                 end = start - 1;
 606                 start += fl->l_len;
 607         }
 608         if (start < 0) {
 609                 vnode_rele(wvp);
 610                 vnode_put(wvp);
 611                 return (EINVAL);
 612         }
 613         if (!NFS_ISV3(vp) &&
 614             ((start >= 0x80000000) || (end >= 0x80000000))) {
 615                 vnode_rele(wvp);
 616                 vnode_put(wvp);
 617                 return (EINVAL);
 618         }
 619
 620         /*
 621          * Fill in the information structure.
 622          */
 623         msgreq.lmr_answered = 0;
 624         msgreq.lmr_errno = 0;
 625         msgreq.lmr_saved_errno = 0;
 626         msg = &msgreq.lmr_msg;
 627         msg->lm_version = LOCKD_MSG_VERSION;
 628         msg->lm_flags = 0;
 629
 630         msg->lm_fl = *fl;
 631         msg->lm_fl.l_start = start;
 632         if (end != -1)
 633                 msg->lm_fl.l_len = end - start + 1;
 634         msg->lm_fl.l_pid = proc_pid(p);
 635
 636         if (ap->a_flags & F_WAIT)
 637                 msg->lm_flags |= LOCKD_MSG_BLOCK;
 638         if (ap->a_op == F_GETLK)
 639                 msg->lm_flags |= LOCKD_MSG_TEST;
 640
 641         nmp = VFSTONFS(vnode_mount(vp));
 642         if (!nmp) {
 643                 vnode_rele(wvp);
 644                 vnode_put(wvp);
 645                 return (ENXIO);
 646         }
 647
 648         saddr = mbuf_data(nmp->nm_nam);
 649         bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len));
 650         msg->lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
 651         bcopy(VTONFS(vp)->n_fhp, msg->lm_fh, msg->lm_fh_len);
 652         if (NFS_ISV3(vp))
 653                 msg->lm_flags |= LOCKD_MSG_NFSV3;
 654         cru2x(cred, &msg->lm_cred);
 655
 656         microuptime(&now);
 657         lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
 658
 659         fmode = FFLAGS(O_WRONLY);
 660         if ((error = VNOP_OPEN(wvp, fmode, ap->a_context))) {
 661                 vnode_rele(wvp);
 662                 vnode_put(wvp);
 663                 return (error);
 664         }
 665         vnode_lock(wvp);
 666         ++wvp->v_writecount;
 667         vnode_unlock(wvp);
 668
 669         /* allocate unique xid */
 670         msg->lm_xid = nfs_lockxid_get();
 671         nfs_lockdmsg_enqueue(&msgreq);
 672
 673         timeo = 2*hz;
 674 #define IO_NOMACCHECK 0;
 675         ioflg = IO_UNIT | IO_NOMACCHECK;
 676         for (;;) {
 677                 error = 0;
 678                 while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
 679                         nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
 680                         error = tsleep((void *)&nfslockdfifolock,
 681                                         PCATCH | PUSER, "lockdfifo", 20*hz);
 682                         if (error)
 683                                 break;
 684                 }
 685                 if (error)
 686                         break;
 687                 nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;
 688
 689                 error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)msg, sizeof(*msg), 0,
 690                     UIO_SYSSPACE32, ioflg, proc_ucred(kernproc), NULL, p);
 691
 692                 nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
 693                 if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
 694                         nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
 695                         wakeup((void *)&nfslockdfifolock);
 696                 }
 697
 698                 if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
 699                         break;
 700                 }
 701
 702                 /*
 703                  * Always wait for an answer.  Not waiting for unlocks could
 704                  * cause a lock to be left if the unlock request gets dropped.
 705                  */
 706
 707                 /*
 708                  * Retry if it takes too long to get a response.
 709                  *
 710                  * The timeout numbers were picked out of thin air... they start
 711                  * at 2 and double each timeout with a max of 60 seconds.
 712                  *
 713                  * In order to maintain responsiveness, we pass a small timeout
 714                  * to tsleep and calculate the timeouts ourselves.  This allows
 715                  * us to pick up on mount changes quicker.
 716                  */
 717 wait_for_granted:
 718                 error = EWOULDBLOCK;
 719                 microuptime(&now);
 720                 if ((timeo/hz) > 0)
 721                         endtime = now.tv_sec + timeo/hz;
 722                 else
 723                         endtime = now.tv_sec + 1;
 724                 while (now.tv_sec < endtime) {
 725                         error = tsleep((void *)&msgreq, PCATCH | PUSER, "lockd", 2*hz);
 726                         if (msgreq.lmr_answered) {
 727                                 /*
 728                                  * Note: it's possible to have a lock granted at
 729                                  * essentially the same time that we get interrupted.
 730                                  * Since the lock may be granted, we can't return an
 731                                  * error from this request or we might not unlock the
 732                                  * lock that's been granted.
 733                                  */
 734                                 error = 0;
 735                                 break;
 736                         }
 737                         if (error != EWOULDBLOCK)
 738                                 break;
 739                         /* check that we still have our mount... */
 740                         /* ...and that we still support locks */
 741                         nmp = VFSTONFS(vnode_mount(vp));
 742                         if (!nmp || (nmp->nm_flag & NFSMNT_NOLOCKS))
 743                                 break;
 744                         /*
 745                          * If the mount is hung and we've requested not to hang
 746                          * on remote filesystems, then bail now.
 747                          */
 748                         if ((p != NULL) && ((proc_noremotehang(p)) != 0) &&
 749                             ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
 750                                 if (fl->l_type == F_UNLCK)
 751                                         printf("nfs_dolock: aborting unlock request "
 752                                             "due to timeout (noremotehang)\n");
 753                                 error = EIO;
 754                                 break;
 755                         }
 756                         microuptime(&now);
 757                 }
 758                 if (error) {
 759                         /* check that we still have our mount... */
 760                         nmp = VFSTONFS(vnode_mount(vp));
 761                         if (!nmp) {
 762                                 if (error == EWOULDBLOCK)
 763                                         error = ENXIO;
 764                                 break;
 765                         }
 766                         /* ...and that we still support locks */
 767                         if (nmp->nm_flag & NFSMNT_NOLOCKS) {
 768                                 if (error == EWOULDBLOCK)
 769                                         error = ENOTSUP;
 770                                 break;
 771                         }
 772                         if ((error == ENOTSUP) &&
 773                             (nmp->nm_state & NFSSTA_LOCKSWORK)) {
 774                                 /*
 775                                  * We have evidence that locks work, yet lockd
 776                                  * returned ENOTSUP.  This is probably because
 777                                  * it was unable to contact the server's lockd to
 778                                  * send it the request.
 779                                  *
 780                                  * Because we know locks work, we'll consider
 781                                  * this failure to be a timeout.
 782                                  */
 783                                 error = EWOULDBLOCK;
 784                         }
 785                         if (error != EWOULDBLOCK) {
 786                                 /*
 787                                  * We're going to bail on this request.
 788                                  * If we were a blocked lock request, send a cancel.
 789                                  */
 790                                 if ((msgreq.lmr_errno == EINPROGRESS) &&
 791                                     !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
 792                                         /* set this request up as a cancel */
 793                                         msg->lm_flags |= LOCKD_MSG_CANCEL;
 794                                         nfs_lockdmsg_dequeue(&msgreq);
 795                                         msg->lm_xid = nfs_lockxid_get();
 796                                         nfs_lockdmsg_enqueue(&msgreq);
 797                                         msgreq.lmr_saved_errno = error;
 798                                         msgreq.lmr_errno = 0;
 799                                         msgreq.lmr_answered = 0;
 800                                         /* reset timeout */
 801                                         timeo = 2*hz;
 802                                         /* send cancel request */
 803                                         continue;
 804                                 }
 805                                 break;
 806                         }
 807
 808                         /*
 809                          * If the mount is hung and we've requested not to hang
 810                          * on remote filesystems, then bail now.
 811                          */
 812                         if ((p != NULL) && ((proc_noremotehang(p)) != 0) &&
 813                             ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_LOCKTIMEO)) != 0)) {
 814                                 if (fl->l_type == F_UNLCK)
 815                                         printf("nfs_dolock: aborting unlock request "
 816                                             "due to timeout (noremotehang)\n");
 817                                 error = EIO;
 818                                 break;
 819                         }
 820                         /* warn if we're not getting any response */
 821                         microuptime(&now);
 822                         if ((msgreq.lmr_errno != EINPROGRESS) &&
 823                             (nmp->nm_tprintf_initial_delay != 0) &&
 824                             ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
 825                                 lastmsg = now.tv_sec;
 826                                 nfs_down(nmp, p, 0, NFSSTA_LOCKTIMEO, "lockd not responding");
 827                                 wentdown = 1;
 828                         }
 829                         if (msgreq.lmr_errno == EINPROGRESS) {
 830                                 /*
 831                                  * We've got a blocked lock request that we are
 832                                  * going to retry.  First, we'll want to try to
 833                                  * send a cancel for the previous request.
 834                                  *
 835                                  * Clear errno so if we don't get a response
 836                                  * to the resend we'll call nfs_down().
 837                                  * Also reset timeout because we'll expect a
 838                                  * quick response to the cancel/resend (even if
 839                                  * it is NLM_BLOCKED).
 840                                  */
 841                                 msg->lm_flags |= LOCKD_MSG_CANCEL;
 842                                 nfs_lockdmsg_dequeue(&msgreq);
 843                                 msg->lm_xid = nfs_lockxid_get();
 844                                 nfs_lockdmsg_enqueue(&msgreq);
 845                                 msgreq.lmr_saved_errno = msgreq.lmr_errno;
 846                                 msgreq.lmr_errno = 0;
 847                                 msgreq.lmr_answered = 0;
 848                                 timeo = 2*hz;
 849                                 /* send cancel then resend request */
 850                                 continue;
 851                         }
 852                         /*
 853                          * We timed out, so we will rewrite the request
 854                          * to the fifo, but only if it isn't already full.
 855                          */
 856                         ioflg |= IO_NDELAY;
 857                         timeo *= 2;
 858                         if (timeo > 60*hz)
 859                                 timeo = 60*hz;
 860                         /* resend request */
 861                         continue;
 862                 }
 863
 864                 /* we got a response, so the server's lockd is OK */
 865                 nfs_up(VFSTONFS(vnode_mount(vp)), p, NFSSTA_LOCKTIMEO,
 866                         wentdown ? "lockd alive again" : NULL);
 867                 wentdown = 0;
 868
 869                 if (msgreq.lmr_errno == EINPROGRESS) {
 870                         /* got NLM_BLOCKED response */
 871                         /* need to wait for NLM_GRANTED */
 872                         timeo = 60*hz;
 873                         msgreq.lmr_answered = 0;
 874                         goto wait_for_granted;
 875                 }
 876
 877                 if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
 878                     (msgreq.lmr_saved_errno == EINPROGRESS)) {
 879                         /*
 880                          * We just got a successful reply to the
 881                          * cancel of the previous blocked lock request.
 882                          * Now, go ahead and resend the request.
 883                          */
 884                         msg->lm_flags &= ~LOCKD_MSG_CANCEL;
 885                         nfs_lockdmsg_dequeue(&msgreq);
 886                         msg->lm_xid = nfs_lockxid_get();
 887                         nfs_lockdmsg_enqueue(&msgreq);
 888                         msgreq.lmr_saved_errno = 0;
 889                         msgreq.lmr_errno = 0;
 890                         msgreq.lmr_answered = 0;
 891                         timeo = 2*hz;
 892                         /* resend request */
 893                         continue;
 894                 }
 895
 896                 if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
 897                         if (msg->lm_fl.l_type != F_UNLCK) {
 898                                 fl->l_type = msg->lm_fl.l_type;
 899                                 fl->l_pid = msg->lm_fl.l_pid;
 900                                 fl->l_start = msg->lm_fl.l_start;
 901                                 fl->l_len = msg->lm_fl.l_len;
 902                                 fl->l_whence = SEEK_SET;
 903                         } else {
 904                                 fl->l_type = F_UNLCK;
 905                         }
 906                 }
 907
 908                 /*
 909                  * If the blocked lock request was cancelled.
 910                  * Restore the error condition from when we
 911                  * originally bailed on the request.
 912                  */
 913                 if (msg->lm_flags & LOCKD_MSG_CANCEL) {
 914                         msg->lm_flags &= ~LOCKD_MSG_CANCEL;
 915                         error = msgreq.lmr_saved_errno;
 916                 } else
 917                         error = msgreq.lmr_errno;
 918
 919                 if (!error) {
 920                         /* record that NFS file locking has worked on this mount */
 921                         nmp = VFSTONFS(vnode_mount(vp));
 922                         if (nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK))
 923                                 nmp->nm_state |= NFSSTA_LOCKSWORK;
 924                         /*
 925                          * If we successfully acquired a lock, make sure this pid
 926                          * is in the nfs_lock_pid hash table so we know we can't
 927                          * short-circuit unlock requests.
 928                          */
 929                         if ((lockpidcheck == ENOENT) &&
 930                             ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW)))
 931                                 nfs_lock_pid_check(p, 1, vp);
 932
 933                 }
 934                 break;
 935         }
 936
 937         nfs_lockdmsg_dequeue(&msgreq);
 938
 939         error1 = VNOP_CLOSE(wvp, FWRITE, ap->a_context);
 940         vnode_rele(wvp);
 941         vnode_put(wvp);
 942         /* prefer any previous 'error' to our vn_close 'error1'. */
 943         return (error != 0 ? error : error1);
 944 }
 945
 946 /*
 947  * nfslockdans --
 948  *      NFS advisory byte-level locks answer from the lock daemon.
 949  */
 950 int
 951 nfslockdans(proc_t p, struct lockd_ans *ansp)
 952 {
 953         LOCKD_MSG_REQUEST *msgreq;
 954         int error;
 955
 956         /* Let root make this call. */
 957         error = proc_suser(p);
 958         if (error)
 959                 return (error);
 960
 961         /* the version should match, or we're out of sync */
 962         if (ansp->la_version != LOCKD_ANS_VERSION)
 963                 return (EINVAL);
 964
 965         /* try to find the lockd message by transaction id (cookie) */
 966         msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
 967         if (ansp->la_flags & LOCKD_ANS_GRANTED) {
 968                 /*
 969                  * We can't depend on the granted message having our cookie,
 970                  * so we check the answer against the lockd message found.
 971                  * If no message was found or it doesn't match the answer,
 972                  * we look for the lockd message by the answer's lock info.
 973                  */
 974                 if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
 975                         msgreq = nfs_lockdmsg_find_by_answer(ansp);
 976                 /*
 977                  * We need to make sure this request isn't being cancelled
 978                  * If it is, we don't want to accept the granted message.
 979                  */
 980                 if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
 981                         msgreq = NULL;
 982         }
 983         if (!msgreq)
 984                 return (EPIPE);
 985
 986         msgreq->lmr_errno = ansp->la_errno;
 987         if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
 988                 if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
 989                         if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
 990                                 msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
 991                         else
 992                                 msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
 993                         msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
 994                         msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
 995                         msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
 996                 } else {
 997                         msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
 998                 }
 999         }
1000
1001         msgreq->lmr_answered = 1;
1002         (void)wakeup((void *)msgreq);
1003
1004         return (0);
1005 }
1006
1007 /*
1008  * nfslockdfd --
1009  *      NFS advisory byte-level locks: fifo file# from the lock daemon.
1010  */
1011 int
1012 nfslockdfd(proc_t p, int fd)
1013 {
1014         int error;
1015         vnode_t vp, oldvp;
1016
1017         error = proc_suser(p);
1018         if (error)
1019                 return (error);
1020         if (fd < 0) {
1021                 vp = NULL;
1022         } else {
1023                 error = file_vnode(fd, &vp);
1024                 if (error)
1025                         return (error);
1026                 error = vnode_getwithref(vp);
1027                 if (error)
1028                         return (error);
1029                 error = vnode_ref(vp);
1030                 if (error) {
1031                         vnode_put(vp);
1032                         return (error);
1033                 }
1034         }
1035         oldvp = nfslockdvnode;
1036         nfslockdvnode = vp;
1037         if (oldvp) {
1038                 vnode_rele(oldvp);
1039         }
1040         (void)wakeup((void *)&nfslockdvnode);
1041         if (vp) {
1042                 vnode_put(vp);
1043         }
1044         return (0);
1045 }
1046
1047 /*
1048  * nfslockdwait --
1049  *      lock daemon waiting for lock request
1050  */
1051 int
1052 nfslockdwait(proc_t p)
1053 {
1054         int error;
1055
1056         error = proc_suser(p);
1057         if (error)
1058                 return (error);
1059         if (nfslockdwaiting || nfslockdvnode)
1060                 return (EBUSY);
1061
1062         nfslockdstarttimeout = 0;
1063         nfslockdwaiting = 1;
1064         tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
1065         nfslockdwaiting = 0;
1066
1067         return (0);
1068 }