bsd/nfs/nfs_lock.c

   1 /*
   2  * Copyright (c) 2002-2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
   7  *
   8  * This file contains Original Code and/or Modifications of Original Code
   9  * as defined in and that are subject to the Apple Public Source License
  10  * Version 2.0 (the 'License'). You may not use this file except in
  11  * compliance with the License. Please obtain a copy of the License at
  12  * http://www.opensource.apple.com/apsl/ and read it before using this
  13  * file.
  14  *
  15  * The Original Code and all software distributed under the License are
  16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  20  * Please see the License for the specific language governing rights and
  21  * limitations under the License.
  22  *
  23  * @APPLE_LICENSE_HEADER_END@
  24  */
  25 /*-
  26  * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
  27  *
  28  * Redistribution and use in source and binary forms, with or without
  29  * modification, are permitted provided that the following conditions
  30  * are met:
  31  * 1. Redistributions of source code must retain the above copyright
  32  *    notice, this list of conditions and the following disclaimer.
  33  * 2. Redistributions in binary form must reproduce the above copyright
  34  *    notice, this list of conditions and the following disclaimer in the
  35  *    documentation and/or other materials provided with the distribution.
  36  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  37  *    promote products derived from this software without specific prior
  38  *    written permission.
  39  *
  40  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  43  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  50  * SUCH DAMAGE.
  51  *
  52  *      from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
  53  */
  54
  55 #include <sys/cdefs.h>
  56 #include <sys/param.h>
  57 #include <sys/systm.h>
  58 #include <sys/fcntl.h>
  59 #include <sys/kernel.h>         /* for hz */
  60 #include <sys/file.h>
  61 #include <sys/lock.h>
  62 #include <sys/malloc.h>
  63 #include <sys/lockf.h>          /* for hz */ /* Must come after sys/malloc.h */
  64 #include <sys/mbuf.h>
  65 #include <sys/mount.h>
  66 #include <sys/namei.h>
  67 #include <sys/proc.h>
  68 #include <sys/resourcevar.h>
  69 #include <sys/socket.h>
  70 #include <sys/socket.h>
  71 #include <sys/unistd.h>
  72 #include <sys/user.h>
  73 #include <sys/vnode.h>
  74
  75 #include <kern/thread_act.h>
  76
  77 #include <machine/limits.h>
  78
  79 #include <net/if.h>
  80
  81 #include <nfs/rpcv2.h>
  82 #include <nfs/nfsproto.h>
  83 #include <nfs/nfs.h>
  84 #include <nfs/nfsmount.h>
  85 #include <nfs/nfsnode.h>
  86 #include <nfs/nfs_lock.h>
  87 #include <nfs/nlminfo.h>
  88
  89 #define OFF_MAX QUAD_MAX
  90
  91 uint64_t nfsadvlocks = 0;
  92 struct timeval nfsadvlock_longest = {0, 0};
  93 struct timeval nfsadvlocks_time = {0, 0};
  94
  95 pid_t nfslockdpid = 0;
  96 struct file *nfslockdfp = 0;
  97 int nfslockdwaiting = 0;
  98 int nfslockdfifowritten = 0;
  99 int nfslockdfifolock = 0;
 100 #define NFSLOCKDFIFOLOCK_LOCKED 1
 101 #define NFSLOCKDFIFOLOCK_WANT   2
 102
 103 /*
 104  * XXX
 105  * We have to let the process know if the call succeeded.  I'm using an extra
 106  * field in the uu_nlminfo field in the uthread structure, as it is already for
 107  * lockd stuff.
 108  */
 109
 110 /*
 111  * nfs_advlock --
 112  *      NFS advisory byte-level locks.
 113  */
 114 int
 115 nfs_dolock(struct vop_advlock_args *ap)
 116 /* struct vop_advlock_args {
 117         struct vnodeop_desc *a_desc;
 118         struct vnode *a_vp;
 119         caddr_t a_id;
 120         int a_op;
 121         struct flock *a_fl;
 122         int a_flags;
 123 }; */
 124 {
 125         LOCKD_MSG msg;
 126         struct nameidata nd;
 127         struct vnode *vp, *wvp;
 128         struct nfsnode *np;
 129         int error, error1;
 130         struct flock *fl;
 131         int fmode, ioflg;
 132         struct proc *p;
 133         struct uthread *ut;
 134         struct timeval elapsed;
 135         struct nfsmount *nmp;
 136         struct vattr vattr;
 137         off_t start, end;
 138
 139         ut = get_bsdthread_info(current_act());
 140         p = current_proc();
 141
 142         vp = ap->a_vp;
 143         fl = ap->a_fl;
 144         np = VTONFS(vp);
 145
 146         nmp = VFSTONFS(vp->v_mount);
 147         if (!nmp)
 148                 return (ENXIO);
 149         if (nmp->nm_flag & NFSMNT_NOLOCKS)
 150                 return (EOPNOTSUPP);
 151
 152         /*
 153          * The NLM protocol doesn't allow the server to return an error
 154          * on ranges, so we do it.  Pre LFS (Large File Summit)
 155          * standards required EINVAL for the range errors.  More recent
 156          * standards use EOVERFLOW, but their EINVAL wording still
 157          * encompasses these errors.
 158          * Any code sensitive to this is either:
 159          *  1) written pre-LFS and so can handle only EINVAL, or
 160          *  2) written post-LFS and thus ought to be tolerant of pre-LFS
 161          *     implementations.
 162          * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
 163          */
 164         if (fl->l_whence != SEEK_END) {
 165                 if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
 166                     fl->l_start < 0 ||
 167                     (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
 168                     (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
 169                         return (EINVAL);
 170         }
 171         /*
 172          * If daemon is running take a ref on its fifo
 173          */
 174         if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data)) {
 175                 if (!nfslockdwaiting)
 176                         return (EOPNOTSUPP);
 177                 /*
 178                  * Don't wake lock daemon if it hasn't been started yet and
 179                  * this is an unlock request (since we couldn't possibly
 180                  * actually have a lock on the file).  This could be an
 181                  * uninformed unlock request due to closef()'s behavior of doing
 182                  * unlocks on all files if a process has had a lock on ANY file.
 183                  */
 184                 if (!nfslockdfp && (fl->l_type == F_UNLCK))
 185                         return (EINVAL);
 186                 /* wake up lock daemon */
 187                 (void)wakeup((void *)&nfslockdwaiting);
 188                 /* wait on nfslockdfp for a while to allow daemon to start */
 189                 tsleep((void *)&nfslockdfp, PCATCH | PUSER, "lockd", 60*hz);
 190                 /* check for nfslockdfp and f_data */
 191                 if (!nfslockdfp || !(wvp = (struct vnode *)nfslockdfp->f_data))
 192                         return (EOPNOTSUPP);
 193         }
 194         VREF(wvp);
 195         /*
 196          * if there is no nfsowner table yet, allocate one.
 197          */
 198         if (ut->uu_nlminfo == NULL) {
 199                 if (ap->a_op == F_UNLCK) {
 200                         vrele(wvp);
 201                         return (0);
 202                 }
 203                 MALLOC(ut->uu_nlminfo, struct nlminfo *,
 204                         sizeof(struct nlminfo), M_LOCKF, M_WAITOK | M_ZERO);
 205                 ut->uu_nlminfo->pid_start = p->p_stats->p_start;
 206         }
 207         /*
 208          * Fill in the information structure.
 209          */
 210         msg.lm_version = LOCKD_MSG_VERSION;
 211         msg.lm_msg_ident.pid = p->p_pid;
 212         msg.lm_msg_ident.ut = ut;
 213         msg.lm_msg_ident.pid_start = ut->uu_nlminfo->pid_start;
 214         msg.lm_msg_ident.msg_seq = ++(ut->uu_nlminfo->msg_seq);
 215
 216         /*
 217          * The NFS Lock Manager protocol doesn't directly handle
 218          * negative lengths or SEEK_END, so we need to normalize
 219          * things here where we have all the info.
 220          * (Note: SEEK_CUR is already adjusted for at this point)
 221          */
 222         /* Convert the flock structure into a start and end. */
 223         switch (fl->l_whence) {
 224         case SEEK_SET:
 225         case SEEK_CUR:
 226                 /*
 227                  * Caller is responsible for adding any necessary offset
 228                  * to fl->l_start when SEEK_CUR is used.
 229                  */
 230                 start = fl->l_start;
 231                 break;
 232         case SEEK_END:
 233                 /* need to flush, and refetch attributes to make */
 234                 /* sure we have the correct end of file offset   */
 235                 if (np->n_flag & NMODIFIED) {
 236                         np->n_xid = 0;
 237                         error = nfs_vinvalbuf(vp, V_SAVE, p->p_ucred, p, 1);
 238                         if (error) {
 239                                 vrele(wvp);
 240                                 return (error);
 241                         }
 242                 }
 243                 np->n_xid = 0;
 244                 error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
 245                 if (error) {
 246                         vrele(wvp);
 247                         return (error);
 248                 }
 249                 start = np->n_size + fl->l_start;
 250                 break;
 251         default:
 252                 vrele(wvp);
 253                 return (EINVAL);
 254         }
 255         if (fl->l_len == 0)
 256                 end = -1;
 257         else if (fl->l_len > 0)
 258                 end = start + fl->l_len - 1;
 259         else { /* l_len is negative */
 260                 end = start - 1;
 261                 start += fl->l_len;
 262         }
 263         if (start < 0) {
 264                 vrele(wvp);
 265                 return (EINVAL);
 266         }
 267
 268         msg.lm_fl = *fl;
 269         msg.lm_fl.l_start = start;
 270         if (end != -1)
 271                 msg.lm_fl.l_len = end - start + 1;
 272
 273         msg.lm_wait = ap->a_flags & F_WAIT;
 274         msg.lm_getlk = ap->a_op == F_GETLK;
 275
 276         nmp = VFSTONFS(vp->v_mount);
 277         if (!nmp) {
 278                 vrele(wvp);
 279                 return (ENXIO);
 280         }
 281
 282         bcopy(mtod(nmp->nm_nam, struct sockaddr *), &msg.lm_addr,
 283               min(sizeof msg.lm_addr,
 284                   mtod(nmp->nm_nam, struct sockaddr *)->sa_len));
 285         msg.lm_fh_len = NFS_ISV3(vp) ? VTONFS(vp)->n_fhsize : NFSX_V2FH;
 286         bcopy(VTONFS(vp)->n_fhp, msg.lm_fh, msg.lm_fh_len);
 287         msg.lm_nfsv3 = NFS_ISV3(vp);
 288         cru2x(p->p_ucred, &msg.lm_cred);
 289
 290         microuptime(&ut->uu_nlminfo->nlm_lockstart);
 291
 292         fmode = FFLAGS(O_WRONLY);
 293         if ((error = VOP_OPEN(wvp, fmode, kernproc->p_ucred, p))) {
 294                 vrele(wvp);
 295                 return (error);
 296         }
 297         ++wvp->v_writecount;
 298
 299 #define IO_NOMACCHECK 0;
 300         ioflg = IO_UNIT | IO_NOMACCHECK;
 301         for (;;) {
 302                 VOP_LEASE(wvp, p, kernproc->p_ucred, LEASE_WRITE);
 303
 304                 while (nfslockdfifolock & NFSLOCKDFIFOLOCK_LOCKED) {
 305                         nfslockdfifolock |= NFSLOCKDFIFOLOCK_WANT;
 306                         if (tsleep((void *)&nfslockdfifolock, PCATCH | PUSER, "lockdfifo", 20*hz))
 307                                 break;
 308                 }
 309                 nfslockdfifolock |= NFSLOCKDFIFOLOCK_LOCKED;
 310
 311                 error = vn_rdwr(UIO_WRITE, wvp, (caddr_t)&msg, sizeof(msg), 0,
 312                     UIO_SYSSPACE, ioflg, kernproc->p_ucred, NULL, p);
 313
 314                 nfslockdfifowritten = 1;
 315
 316                 nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_LOCKED;
 317                 if (nfslockdfifolock & NFSLOCKDFIFOLOCK_WANT) {
 318                         nfslockdfifolock &= ~NFSLOCKDFIFOLOCK_WANT;
 319                         wakeup((void *)&nfslockdfifolock);
 320                 }
 321                 /* wake up lock daemon */
 322                 if (nfslockdwaiting)
 323                         (void)wakeup((void *)&nfslockdwaiting);
 324
 325                 if (error && (((ioflg & IO_NDELAY) == 0) || error != EAGAIN)) {
 326                         break;
 327                 }
 328                 /*
 329                  * If we're locking a file, wait for an answer.  Unlocks succeed
 330                  * immediately.
 331                  */
 332                 if (fl->l_type == F_UNLCK)
 333                         /*
 334                          * XXX this isn't exactly correct.  The client side
 335                          * needs to continue sending it's unlock until
 336                          * it gets a response back.
 337                          */
 338                         break;
 339
 340                 /*
 341                  * retry after 20 seconds if we haven't gotten a response yet.
 342                  * This number was picked out of thin air... but is longer
 343                  * then even a reasonably loaded system should take (at least
 344                  * on a local network).  XXX Probably should use a back-off
 345                  * scheme.
 346                  */
 347                 if ((error = tsleep((void *)ut->uu_nlminfo,
 348                                     PCATCH | PUSER, "lockd", 20*hz)) != 0) {
 349                         if (error == EWOULDBLOCK) {
 350                                 /*
 351                                  * We timed out, so we rewrite the request
 352                                  * to the fifo, but only if it isn't already
 353                                  * full.
 354                                  */
 355                                 ioflg |= IO_NDELAY;
 356                                 continue;
 357                         }
 358
 359                         break;
 360                 }
 361
 362                 if (msg.lm_getlk && ut->uu_nlminfo->retcode == 0) {
 363                         if (ut->uu_nlminfo->set_getlk) {
 364                                 fl->l_pid = ut->uu_nlminfo->getlk_pid;
 365                                 fl->l_start = ut->uu_nlminfo->getlk_start;
 366                                 fl->l_len = ut->uu_nlminfo->getlk_len;
 367                                 fl->l_whence = SEEK_SET;
 368                         } else {
 369                                 fl->l_type = F_UNLCK;
 370                         }
 371                 }
 372                 error = ut->uu_nlminfo->retcode;
 373                 break;
 374         }
 375
 376         /* XXX stats */
 377         nfsadvlocks++;
 378         microuptime(&elapsed);
 379         timevalsub(&elapsed, &ut->uu_nlminfo->nlm_lockstart);
 380         if (timevalcmp(&elapsed, &nfsadvlock_longest, >))
 381                 nfsadvlock_longest = elapsed;
 382         timevaladd(&nfsadvlocks_time, &elapsed);
 383         timerclear(&ut->uu_nlminfo->nlm_lockstart);
 384
 385         error1 = vn_close(wvp, FWRITE, kernproc->p_ucred, p);
 386         /* prefer any previous 'error' to our vn_close 'error1'. */
 387         return (error != 0 ? error : error1);
 388 }
 389
 390 /*
 391  * nfslockdans --
 392  *      NFS advisory byte-level locks answer from the lock daemon.
 393  */
 394 int
 395 nfslockdans(struct proc *p, struct lockd_ans *ansp)
 396 {
 397         struct proc *targetp;
 398         struct uthread *targetut, *uth;
 399         int error;
 400
 401         /*
 402          * Let root, or someone who once was root (lockd generally
 403          * switches to the daemon uid once it is done setting up) make
 404          * this call.
 405          *
 406          * XXX This authorization check is probably not right.
 407          */
 408         if ((error = suser(p->p_ucred, &p->p_acflag)) != 0 &&
 409             p->p_cred->p_svuid != 0)
 410                 return (error);
 411
 412         /* the version should match, or we're out of sync */
 413         if (ansp->la_vers != LOCKD_ANS_VERSION)
 414                 return (EINVAL);
 415
 416         /* Find the process & thread */
 417         if ((targetp = pfind(ansp->la_msg_ident.pid)) == NULL)
 418                 return (ESRCH);
 419         targetut = ansp->la_msg_ident.ut;
 420         TAILQ_FOREACH(uth, &targetp->p_uthlist, uu_list) {
 421                 if (uth == targetut)
 422                         break;
 423         }
 424         /*
 425          * Verify the pid hasn't been reused (if we can), and it isn't waiting
 426          * for an answer from a more recent request.  We return an EPIPE if
 427          * the match fails, because we've already used ESRCH above, and this
 428          * is sort of like writing on a pipe after the reader has closed it.
 429          * If only the seq# is off, don't return an error just return.  It could
 430          * just be a response to a retransmitted request.
 431          */
 432         if (uth == NULL || uth != targetut || targetut->uu_nlminfo == NULL)
 433                 return (EPIPE);
 434         if (ansp->la_msg_ident.msg_seq != -1) {
 435                 if (timevalcmp(&targetut->uu_nlminfo->pid_start,
 436                                &ansp->la_msg_ident.pid_start, !=))
 437                         return (EPIPE);
 438                 if (targetut->uu_nlminfo->msg_seq != ansp->la_msg_ident.msg_seq)
 439                         return (0);
 440         }
 441
 442         /* Found the thread, so set its return errno and wake it up. */
 443
 444         targetut->uu_nlminfo->retcode = ansp->la_errno;
 445         targetut->uu_nlminfo->set_getlk = ansp->la_getlk_set;
 446         targetut->uu_nlminfo->getlk_pid = ansp->la_getlk_pid;
 447         targetut->uu_nlminfo->getlk_start = ansp->la_getlk_start;
 448         targetut->uu_nlminfo->getlk_len = ansp->la_getlk_len;
 449
 450         (void)wakeup((void *)targetut->uu_nlminfo);
 451
 452         return (0);
 453 }
 454
 455 /*
 456  * nfslockdfd --
 457  *      NFS advisory byte-level locks: fifo file# from the lock daemon.
 458  */
 459 int
 460 nfslockdfd(struct proc *p, int fd)
 461 {
 462         int error;
 463         struct file *fp, *ofp;
 464
 465         error = suser(p->p_ucred, &p->p_acflag);
 466         if (error)
 467                 return (error);
 468         if (fd < 0) {
 469                 fp = 0;
 470         } else {
 471                 error = getvnode(p, fd, &fp);
 472                 if (error)
 473                         return (error);
 474                 (void)fref(fp);
 475         }
 476         ofp = nfslockdfp;
 477         nfslockdfp = fp;
 478         if (ofp)
 479                 (void)frele(ofp);
 480         nfslockdpid = nfslockdfp ? p->p_pid : 0;
 481         (void)wakeup((void *)&nfslockdfp);
 482         return (0);
 483 }
 484
 485 /*
 486  * nfslockdwait --
 487  *      lock daemon waiting for lock request
 488  */
 489 int
 490 nfslockdwait(struct proc *p)
 491 {
 492         int error;
 493         struct file *fp, *ofp;
 494
 495         if (p->p_pid != nfslockdpid) {
 496                 error = suser(p->p_ucred, &p->p_acflag);
 497                 if (error)
 498                         return (error);
 499         }
 500         if (nfslockdwaiting)
 501                 return (EBUSY);
 502         if (nfslockdfifowritten) {
 503                 nfslockdfifowritten = 0;
 504                 return (0);
 505         }
 506
 507         nfslockdwaiting = 1;
 508         tsleep((void *)&nfslockdwaiting, PCATCH | PUSER, "lockd", 0);
 509         nfslockdwaiting = 0;
 510
 511         return (0);
 512 }