1 /*
2 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * This code is derived from software contributed to Berkeley by
33 * Scooter Morris at Genentech Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
60 */
61
62 #include <sys/cdefs.h>
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/lock.h>
67 #include <sys/mount.h>
68 #include <sys/proc.h>
69 #include <sys/signalvar.h>
70 #include <sys/unistd.h>
71 #include <sys/user.h>
72 #include <sys/vnode.h>
73 #include <sys/vnode_internal.h>
74 #include <sys/vnode_if.h>
75 #include <sys/malloc.h>
76 #include <sys/fcntl.h>
77 #include <sys/lockf.h>
78
79 /*
80 * This variable controls the maximum number of processes that will
81 * be checked in doing deadlock detection.
82 */
83 static int maxlockdepth = MAXDEPTH;
84
85 #ifdef LOCKF_DEBUGGING
86 #include <sys/sysctl.h>
87 #include <ufs/ufs/quota.h>
88 #include <ufs/ufs/inode.h>
89 void lf_print(const char *tag, struct lockf *lock);
90 void lf_printlist(const char *tag, struct lockf *lock);
91 static int lockf_debug = 2;
92 SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
93
94 /*
95 * If there is no mask bit selector, or there is one, and the selector is
96 * set, then output the debugging diagnostic.
97 */
98 #define LOCKF_DEBUG(mask, ...) \
99 do { \
100 if( !(mask) || ((mask) & lockf_debug)) { \
101 printf(__VA_ARGS__); \
102 } \
103 } while(0)
104 #else /* !LOCKF_DEBUGGING */
105 #define LOCKF_DEBUG(mask, ...) /* mask */
106 #endif /* !LOCKF_DEBUGGING */
107
108 MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
109
110 #define NOLOCKF (struct lockf *)0
111 #define SELF 0x1
112 #define OTHERS 0x2
113 #define OFF_MAX 0x7fffffffffffffffULL /* max off_t */
114
115 /*
116 * Overlapping lock states
117 */
118 typedef enum {
119 OVERLAP_NONE = 0,
120 OVERLAP_EQUALS_LOCK,
121 OVERLAP_CONTAINS_LOCK,
122 OVERLAP_CONTAINED_BY_LOCK,
123 OVERLAP_STARTS_BEFORE_LOCK,
124 OVERLAP_ENDS_AFTER_LOCK
125 } overlap_t;
126
127 static int lf_clearlock(struct lockf *);
128 static overlap_t lf_findoverlap(struct lockf *,
129 struct lockf *, int, struct lockf ***, struct lockf **);
130 static struct lockf *lf_getblock(struct lockf *);
131 static int lf_getlock(struct lockf *, struct flock *);
132 static int lf_setlock(struct lockf *);
133 static int lf_split(struct lockf *, struct lockf *);
134 static void lf_wakelock(struct lockf *, boolean_t);
135
136
137 /*
138 * In order to mitigate risk, don't switch to the
139 * new wake-one method unless we have at least this
140 * many waiters to wake up.
141 */
142 #define SAFE_WAITER_LIMIT 20
143
144
145 /*
146 * lf_advlock
147 *
148 * Description: Advisory record locking support
149 *
150 * Parameters: ap Argument pointer to a vnop_advlock_args
151 * argument descriptor structure for the
152 * lock operation to be attempted.
153 *
154 * Returns: 0 Success
155 * EOVERFLOW
156 * EINVAL
157 * ENOLCK Number of locked regions exceeds limit
158 * lf_setlock:EAGAIN
159 * lf_setlock:EDEADLK
160 * lf_setlock:EINTR
161 * lf_setlock:ENOLCK
162 * lf_clearlock:ENOLCK
163 * vnode_size:???
164 *
165 * Notes: We return ENOLCK when we run out of memory to support locks; as
166 * such, there is no specific limit other than the
167 * amount of available resources.
168 */
169 int
170 lf_advlock(struct vnop_advlock_args *ap)
171 {
172 struct vnode *vp = ap->a_vp;
173 struct flock *fl = ap->a_fl;
174 vfs_context_t context = ap->a_context;
175 struct lockf *lock;
176 off_t start, end, oadd;
177 u_quad_t size;
178 int error;
179 struct lockf **head = &vp->v_lockf;
180
181 /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */
182
183 /*
184 * Avoid the common case of unlocking when inode has no locks.
185 */
186 if (*head == (struct lockf *)0) {
187 if (ap->a_op != F_SETLK) {
188 fl->l_type = F_UNLCK;
189 LOCKF_DEBUG(0, "lf_advlock: '%s' unlock without lock\n", vfs_context_proc(context)->p_comm);
190 return (0);
191 }
192 }
193
194 /*
195 * Convert the flock structure into a start and end.
196 */
197 switch (fl->l_whence) {
198
199 case SEEK_SET:
200 case SEEK_CUR:
201 /*
202 * Caller is responsible for adding any necessary offset
203 * when SEEK_CUR is used.
204 */
205 start = fl->l_start;
206 break;
207
208 case SEEK_END:
209
210 /*
211 * It's OK to cast the u_quad_t to an off_t here, since they
212 * are the same storage size, and the value of the returned
213 * contents will never overflow into the sign bit. We need to
214 * do this because we will use size to force range checks.
215 */
216 if ((error = vnode_size(vp, (off_t *)&size, context))) {
217 LOCKF_DEBUG(0, "lf_advlock: vnode_getattr failed: %d\n", error);
218 return (error);
219 }
220
221 if (size > OFF_MAX ||
222 (fl->l_start > 0 &&
223 size > (u_quad_t)(OFF_MAX - fl->l_start)))
224 return (EOVERFLOW);
225 start = size + fl->l_start;
226 break;
227
228 default:
229 LOCKF_DEBUG(0, "lf_advlock: unknown whence %d\n", fl->l_whence);
230 return (EINVAL);
231 }
232 if (start < 0) {
233 LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
234 return (EINVAL);
235 }
236 if (fl->l_len < 0) {
237 if (start == 0) {
238 LOCKF_DEBUG(0, "lf_advlock: len < 0 & start == 0\n");
239 return (EINVAL);
240 }
241 end = start - 1;
242 start += fl->l_len;
243 if (start < 0) {
244 LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
245 return (EINVAL);
246 }
247 } else if (fl->l_len == 0)
248 end = -1;
249 else {
250 oadd = fl->l_len - 1;
251 if (oadd > (off_t)(OFF_MAX - start)) {
252 LOCKF_DEBUG(0, "lf_advlock: overflow\n");
253 return (EOVERFLOW);
254 }
255 end = start + oadd;
256 }
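/*
 * For example (illustrative values): SEEK_SET with l_start 100 and
 * l_len 50 yields start 100, end 149; l_len 0 yields end -1, meaning
 * "through end of file"; a negative length such as l_start 100,
 * l_len -50 locks the region preceding l_start, i.e. start 50, end 99.
 */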
257 /*
258 * Create the lockf structure
259 */
260 MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
261 if (lock == NULL)
262 return (ENOLCK);
263 lock->lf_start = start;
264 lock->lf_end = end;
265 lock->lf_id = ap->a_id;
266 lock->lf_vnode = vp;
267 lock->lf_type = fl->l_type;
268 lock->lf_head = head;
269 lock->lf_next = (struct lockf *)0;
270 lock->lf_waiters = 0;
271 TAILQ_INIT(&lock->lf_blkhd);
272 lock->lf_flags = ap->a_flags;
273
274 if (ap->a_flags & F_FLOCK)
275 lock->lf_flags |= F_WAKE1_SAFE;
276
277 lck_mtx_lock(&vp->v_lock); /* protect the lockf list */
278 /*
279 * Do the requested operation.
280 */
281 switch(ap->a_op) {
282 case F_SETLK:
283 error = lf_setlock(lock);
284 break;
285
286 case F_UNLCK:
287 error = lf_clearlock(lock);
288 FREE(lock, M_LOCKF);
289 break;
290
291 case F_GETLK:
292 error = lf_getlock(lock, fl);
293 FREE(lock, M_LOCKF);
294 break;
295
296 default:
297 FREE(lock, M_LOCKF);
298 error = EINVAL;
299 break;
300 }
301 lck_mtx_unlock(&vp->v_lock); /* done manipulating the list */
302
303 LOCKF_DEBUG(0, "lf_advlock: normal exit: %d\n\n", error);
304 return (error);
305 }
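
/*
 * Userspace typically reaches lf_advlock() through fcntl(2) on an open file.
 * A rough sketch of a caller (illustrative only, not part of this file; the
 * path name is hypothetical and error handling is omitted):
 *
 *	struct flock fl;
 *	int fd = open("/tmp/example", O_RDWR);
 *
 *	fl.l_type = F_WRLCK;
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;
 *	fl.l_len = 100;
 *
 *	fcntl(fd, F_SETLKW, &fl);
 *
 * locks bytes 0..99 exclusively; because F_SETLKW is a blocking request
 * (F_WAIT set), lf_setlock() sleeps on a conflict rather than returning
 * EAGAIN.  Setting fl.l_type = F_UNLCK and calling fcntl(fd, F_SETLK, &fl)
 * releases the range via lf_clearlock(), and F_GETLK reports a conflicting
 * lock via lf_getlock().
 */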
306
307
308 /*
309 * lf_coelesce_adjacent
310 *
311 * Description: Helper function: when setting a lock, coalesce adjacent
312 * locks. Needed because adjacent locks are not overlapping,
313 * but POSIX requires that they be coalesced.
314 *
315 * Parameters: lock The new lock which may be adjacent
316 * to already locked regions, and which
317 * should therefore be coalesced with them
318 *
319 * Returns: <void>
320 */
321 static void
322 lf_coelesce_adjacent(struct lockf *lock)
323 {
324 struct lockf **lf = lock->lf_head;
325
326 while (*lf != NOLOCKF) {
327 /* reject locks that obviously could not be coalesced */
328 if ((*lf == lock) ||
329 ((*lf)->lf_id != lock->lf_id) ||
330 ((*lf)->lf_type != lock->lf_type)) {
331 lf = &(*lf)->lf_next;
332 continue;
333 }
334
335 /* If the lock ends adjacent to us, we can coalesce it */
336 if ((*lf)->lf_end != -1 &&
337 ((*lf)->lf_end + 1) == lock->lf_start) {
338 struct lockf *adjacent = *lf;
339
340 LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent previous\n");
341 lock->lf_start = (*lf)->lf_start;
342 *lf = lock;
343 lf = &(*lf)->lf_next;
344 FREE(adjacent, M_LOCKF);
345 continue;
346 }
347 /* If the lock starts adjacent to us, we can coalesce it */
348 if (lock->lf_end != -1 &&
349 (lock->lf_end + 1) == (*lf)->lf_start) {
350 struct lockf *adjacent = *lf;
351
352 LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent following\n");
353 lock->lf_end = (*lf)->lf_end;
354 lock->lf_next = (*lf)->lf_next;
355 lf = &lock->lf_next;
356 FREE(adjacent, M_LOCKF);
357 continue;
358 }
359
360 /* no matching conditions; go on to next lock */
361 lf = &(*lf)->lf_next;
362 }
363 }
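
/*
 * For example (illustrative ranges): if a process already holds an exclusive
 * lock on bytes [0..99] and then successfully sets another exclusive lock on
 * [100..199], the two entries are merged here into a single lock covering
 * [0..199]; byte 99 and byte 100 are adjacent but do not overlap, so
 * lf_findoverlap() alone would never combine them.
 */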
364
365
366 /*
367 * lf_setlock
368 *
369 * Description: Set a byte-range lock.
370 *
371 * Parameters: lock The lock structure describing the lock
372 * to be set; allocated by the caller, it
373 * will be linked into the lock list if
374 * the set is successful, and freed if the
375 * set is unsuccessful.
376 *
377 * Returns: 0 Success
378 * EAGAIN
379 * EDEADLK
380 * lf_split:ENOLCK
381 * lf_clearlock:ENOLCK
382 * msleep:EINTR
383 *
384 * Notes: We add the lock to the provisional lock list. We do not
385 * coalesce at this time; this has implications for other lock
386 * requestors in the blocker search mechanism.
387 */
388 static int
389 lf_setlock(struct lockf *lock)
390 {
391 struct lockf *block;
392 struct lockf **head = lock->lf_head;
393 struct lockf **prev, *overlap, *ltmp;
394 static char lockstr[] = "lockf";
395 int priority, needtolink, error;
396 struct vnode *vp = lock->lf_vnode;
397 overlap_t ovcase;
398
399 #ifdef LOCKF_DEBUGGING
400 if (lockf_debug & 1) {
401 lf_print("lf_setlock", lock);
402 lf_printlist("lf_setlock(in)", lock);
403 }
404 #endif /* LOCKF_DEBUGGING */
405
406 /*
407 * Set the priority
408 */
409 priority = PLOCK;
410 if (lock->lf_type == F_WRLCK)
411 priority += 4;
412 priority |= PCATCH;
413 /*
414 * Scan lock list for this file looking for locks that would block us.
415 */
416 while ((block = lf_getblock(lock))) {
417 /*
418 * Free the structure and return if nonblocking.
419 */
420 if ((lock->lf_flags & F_WAIT) == 0) {
421 FREE(lock, M_LOCKF);
422 return (EAGAIN);
423 }
424
425 /*
426 * We are blocked. Since flock style locks cover
427 * the whole file, there is no chance for deadlock.
428 * For byte-range locks we must check for deadlock.
429 *
430 * Deadlock detection is done by looking through the
431 * wait channels to see if there are any cycles that
432 * involve us. MAXDEPTH is set just to make sure we
433 * do not go off into neverland.
434 */
435 if ((lock->lf_flags & F_POSIX) &&
436 (block->lf_flags & F_POSIX)) {
437 struct proc *wproc, *bproc;
438 struct uthread *ut;
439 struct lockf *waitblock;
440 int i = 0;
441
442 /* The block is waiting on something */
443 wproc = (struct proc *)block->lf_id;
444 proc_lock(wproc);
445 TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
446 /*
447 * While the thread is asleep (uu_wchan != 0)
448 * in this code (uu_wmesg == lockstr)
449 * and we have not exceeded the maximum cycle
450 * depth (i < maxlockdepth), then check for a
451 * cycle to see if the lock is blocked behind
452 * someone blocked behind us.
453 */
454 while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
455 ut->uu_wmesg == lockstr &&
456 (i++ < maxlockdepth)) {
457 waitblock = (struct lockf *)ut->uu_wchan;
458 /*
459 * Get the lock blocking the lock
460 * which would block us, and make
461 * certain it hasn't come unblocked
462 * (been granted, e.g. between the time
463 * we called lf_getblock, and the time
464 * we successfully acquired the
465 * proc_lock).
466 */
467 waitblock = waitblock->lf_next;
468 if (waitblock == NULL)
469 break;
470
471 /*
472 * Make sure it's an advisory range
473 * lock and not an overall file lock;
474 * if we mix lock types, it's our own
475 * fault.
476 */
477 if ((waitblock->lf_flags & F_POSIX) == 0)
478 break;
479
480 /*
481 * If the owner of the lock that's
482 * blocking the lock that's blocking us
483 * is the process requesting this lock,
484 * then we would deadlock, so error out.
485 */
486 bproc = (struct proc *)waitblock->lf_id;
487 if (bproc == (struct proc *)lock->lf_id) {
488 proc_unlock(wproc);
489 FREE(lock, M_LOCKF);
490 return (EDEADLK);
491 }
492 }
493 }
494 proc_unlock(wproc);
495 }
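/*
 * As an illustration (hypothetical processes): if process A holds
 * [0..9] and is requesting [10..19], while process B holds [10..19]
 * and is already asleep waiting for [0..9], the walk above starts at
 * B (the owner of our blocker), follows B's wait channel to the lock
 * blocking B, and finds that its owner is A, the requesting process,
 * so EDEADLK is returned instead of sleeping forever.
 */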
496
497 /*
498 * For flock type locks, we must first remove
499 * any shared locks that we hold before we sleep
500 * waiting for an exclusive lock.
501 */
502 if ((lock->lf_flags & F_FLOCK) &&
503 lock->lf_type == F_WRLCK) {
504 lock->lf_type = F_UNLCK;
505 if ((error = lf_clearlock(lock)) != 0) {
506 FREE(lock, M_LOCKF);
507 return (error);
508 }
509 lock->lf_type = F_WRLCK;
510 }
511 /*
512 * Add our lock to the blocked list and sleep until we're free.
513 * Remember who blocked us (for deadlock detection).
514 */
515 lock->lf_next = block;
516 TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
517 block->lf_waiters++;
518
519 if ( !(lock->lf_flags & F_FLOCK))
520 block->lf_flags &= ~F_WAKE1_SAFE;
521
522 #ifdef LOCKF_DEBUGGING
523 if (lockf_debug & 1) {
524 lf_print("lf_setlock: blocking on", block);
525 lf_printlist("lf_setlock(block)", block);
526 }
527 #endif /* LOCKF_DEBUGGING */
528 error = msleep(lock, &vp->v_lock, priority, lockstr, 0);
529
530 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
531 struct lockf *tlock;
532
533 if ((block = lf_getblock(lock))) {
534 TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
535 tlock->lf_next = block;
536 }
537 TAILQ_CONCAT(&block->lf_blkhd, &lock->lf_blkhd, lf_block);
538
539 block->lf_waiters += lock->lf_waiters;
540 lock->lf_waiters = 0;
541 }
542 }
543 if (error) { /* XXX */
544 /*
545 * We may have been awakened by a signal and/or by a
546 * debugger continuing us (in which cases we must remove
547 * ourselves from the blocked list) and/or by another
548 * process releasing a lock (in which case we have
549 * already been removed from the blocked list and our
550 * lf_next field set to NOLOCKF).
551 */
552 if (lock->lf_next) {
553 TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
554 lock->lf_next->lf_waiters--;
555 lock->lf_next = NOLOCKF;
556 }
557 if (!TAILQ_EMPTY(&lock->lf_blkhd))
558 lf_wakelock(lock, TRUE);
559
560 FREE(lock, M_LOCKF);
561 return (error);
562 } /* XXX */
563 }
564 /*
565 * No blocks!! Add the lock. Note that we will
566 * downgrade or upgrade any overlapping locks this
567 * process already owns.
568 *
569 * Skip over locks owned by other processes.
570 * Handle any locks that overlap and are owned by ourselves.
571 */
572 prev = head;
573 block = *head;
574 needtolink = 1;
575 for (;;) {
576 ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
577 if (ovcase)
578 block = overlap->lf_next;
579 /*
580 * Six cases:
581 * 0) no overlap
582 * 1) overlap == lock
583 * 2) overlap contains lock
584 * 3) lock contains overlap
585 * 4) overlap starts before lock
586 * 5) overlap ends after lock
587 */
588 switch (ovcase) {
589 case OVERLAP_NONE:
590 if (needtolink) {
591 *prev = lock;
592 lock->lf_next = overlap;
593 }
594 break;
595
596 case OVERLAP_EQUALS_LOCK:
597 /*
598 * If downgrading lock, others may be
599 * able to acquire it.
600 */
601 if (lock->lf_type == F_RDLCK &&
602 overlap->lf_type == F_WRLCK)
603 lf_wakelock(overlap, TRUE);
604 overlap->lf_type = lock->lf_type;
605 FREE(lock, M_LOCKF);
606 lock = overlap; /* for lf_coelesce_adjacent() */
607 break;
608
609 case OVERLAP_CONTAINS_LOCK:
610 /*
611 * Check for common starting point and different types.
612 */
613 if (overlap->lf_type == lock->lf_type) {
614 FREE(lock, M_LOCKF);
615 lock = overlap; /* for lf_coelesce_adjacent() */
616 break;
617 }
618 if (overlap->lf_start == lock->lf_start) {
619 *prev = lock;
620 lock->lf_next = overlap;
621 overlap->lf_start = lock->lf_end + 1;
622 } else {
623 /*
624 * If we can't split the lock, we can't
625 * grant it. Claim a system limit for the
626 * resource shortage.
627 */
628 if (lf_split(overlap, lock)) {
629 FREE(lock, M_LOCKF);
630 return (ENOLCK);
631 }
632 }
633 lf_wakelock(overlap, TRUE);
634 break;
635
636 case OVERLAP_CONTAINED_BY_LOCK:
637 /*
638 * If downgrading lock, others may be able to
639 * acquire it, otherwise take the list.
640 */
641 if (lock->lf_type == F_RDLCK &&
642 overlap->lf_type == F_WRLCK) {
643 lf_wakelock(overlap, TRUE);
644 } else {
645 while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
646 ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
647 TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
648 lf_block);
649 overlap->lf_waiters--;
650
651 TAILQ_INSERT_TAIL(&lock->lf_blkhd,
652 ltmp, lf_block);
653 lock->lf_waiters++;
654
655 ltmp->lf_next = lock;
656 }
657 }
658 /*
659 * Add the new lock if necessary and delete the overlap.
660 */
661 if (needtolink) {
662 *prev = lock;
663 lock->lf_next = overlap->lf_next;
664 prev = &lock->lf_next;
665 needtolink = 0;
666 } else
667 *prev = overlap->lf_next;
668 FREE(overlap, M_LOCKF);
669 continue;
670
671 case OVERLAP_STARTS_BEFORE_LOCK:
672 /*
673 * Add lock after overlap on the list.
674 */
675 lock->lf_next = overlap->lf_next;
676 overlap->lf_next = lock;
677 overlap->lf_end = lock->lf_start - 1;
678 prev = &lock->lf_next;
679 lf_wakelock(overlap, TRUE);
680 needtolink = 0;
681 continue;
682
683 case OVERLAP_ENDS_AFTER_LOCK:
684 /*
685 * Add the new lock before overlap.
686 */
687 if (needtolink) {
688 *prev = lock;
689 lock->lf_next = overlap;
690 }
691 overlap->lf_start = lock->lf_end + 1;
692 lf_wakelock(overlap, TRUE);
693 break;
694 }
695 break;
696 }
697 /* Coalesce adjacent locks with identical attributes */
698 lf_coelesce_adjacent(lock);
699 #ifdef LOCKF_DEBUGGING
700 if (lockf_debug & 1) {
701 lf_print("lf_setlock: got the lock", lock);
702 lf_printlist("lf_setlock(out)", lock);
703 }
704 #endif /* LOCKF_DEBUGGING */
705 return (0);
706 }
707
708
709 /*
710 * lf_clearlock
711 *
712 * Description: Remove a byte-range lock on a vnode. Generally, find the
713 * lock (or an overlap to that lock) and remove it (or shrink
714 * it), then wakeup anyone we can.
715 *
716 * Parameters: unlock The lock to clear
717 *
718 * Returns: 0 Success
719 * lf_split:ENOLCK
720 *
721 * Notes: A caller may unlock all the locks owned by the caller by
722 * specifying the entire file range; locks owned by other
723 * callers are not affected by this operation.
724 */
725 static int
726 lf_clearlock(struct lockf *unlock)
727 {
728 struct lockf **head = unlock->lf_head;
729 struct lockf *lf = *head;
730 struct lockf *overlap, **prev;
731 overlap_t ovcase;
732
733 if (lf == NOLOCKF)
734 return (0);
735 #ifdef LOCKF_DEBUGGING
736 if (unlock->lf_type != F_UNLCK)
737 panic("lf_clearlock: bad type");
738 if (lockf_debug & 1)
739 lf_print("lf_clearlock", unlock);
740 #endif /* LOCKF_DEBUGGING */
741 prev = head;
742 while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
743 /*
744 * Wakeup the list of locks to be retried.
745 */
746 lf_wakelock(overlap, FALSE);
747
748 switch (ovcase) {
749 case OVERLAP_NONE: /* satisfy compiler enum/switch */
750 break;
751
752 case OVERLAP_EQUALS_LOCK:
753 *prev = overlap->lf_next;
754 FREE(overlap, M_LOCKF);
755 break;
756
757 case OVERLAP_CONTAINS_LOCK: /* split it */
758 if (overlap->lf_start == unlock->lf_start) {
759 overlap->lf_start = unlock->lf_end + 1;
760 break;
761 }
762 /*
763 * If we can't split the lock, we can't grant it.
764 * Claim a system limit for the resource shortage.
765 */
766 if (lf_split(overlap, unlock))
767 return (ENOLCK);
768 overlap->lf_next = unlock->lf_next;
769 break;
770
771 case OVERLAP_CONTAINED_BY_LOCK:
772 *prev = overlap->lf_next;
773 lf = overlap->lf_next;
774 FREE(overlap, M_LOCKF);
775 continue;
776
777 case OVERLAP_STARTS_BEFORE_LOCK:
778 overlap->lf_end = unlock->lf_start - 1;
779 prev = &overlap->lf_next;
780 lf = overlap->lf_next;
781 continue;
782
783 case OVERLAP_ENDS_AFTER_LOCK:
784 overlap->lf_start = unlock->lf_end + 1;
785 break;
786 }
787 break;
788 }
789 #ifdef LOCKF_DEBUGGING
790 if (lockf_debug & 1)
791 lf_printlist("lf_clearlock", unlock);
792 #endif /* LOCKF_DEBUGGING */
793 return (0);
794 }
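
/*
 * For example (illustrative ranges): a caller holding a single lock on
 * [0..199] who unlocks [50..99] is left with two locks, [0..49] and
 * [100..199], via lf_split(); unlocking with start 0 and an unbounded end
 * (the whole file) removes every lock the caller owns on the vnode.
 */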
795
796
797 /*
798 * lf_getlock
799 *
800 * Description: Check whether there is a blocking lock, and if so return
801 * its process identifier into the lock being requested.
802 *
803 * Parameters: lock Pointer to lock to test for blocks
804 * fl Pointer to flock structure to receive
805 * the blocking lock information, if a
806 * blocking lock is found.
807 *
808 * Returns: 0 Success
809 *
810 * Implicit Returns:
811 * *fl Contents modified to reflect the
812 * blocking lock, if one is found; not
813 * modified otherwise
814 *
815 * Notes: fl->l_pid will be (-1) for file locks and will only be set to
816 * the blocking process ID for advisory record locks.
817 */
818 static int
819 lf_getlock(struct lockf *lock, struct flock *fl)
820 {
821 struct lockf *block;
822
823 #ifdef LOCKF_DEBUGGING
824 if (lockf_debug & 1)
825 lf_print("lf_getlock", lock);
826 #endif /* LOCKF_DEBUGGING */
827
828 if ((block = lf_getblock(lock))) {
829 fl->l_type = block->lf_type;
830 fl->l_whence = SEEK_SET;
831 fl->l_start = block->lf_start;
832 if (block->lf_end == -1)
833 fl->l_len = 0;
834 else
835 fl->l_len = block->lf_end - block->lf_start + 1;
836 if (block->lf_flags & F_POSIX)
837 fl->l_pid = proc_pid((struct proc *)(block->lf_id));
838 else
839 fl->l_pid = -1;
840 } else {
841 fl->l_type = F_UNLCK;
842 }
843 return (0);
844 }
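
/*
 * From userspace this is the F_GETLK path: the caller's struct flock is
 * overwritten with the first conflicting lock, or l_type is set to F_UNLCK
 * if the requested range could be locked.  A rough sketch (illustrative
 * only, not part of this file):
 *
 *	struct flock fl = { .l_start = 0, .l_len = 0,
 *	    .l_type = F_WRLCK, .l_whence = SEEK_SET };
 *
 *	fcntl(fd, F_GETLK, &fl);
 *	if (fl.l_type != F_UNLCK)
 *		printf("blocked by pid %d\n", (int)fl.l_pid);
 *
 * l_pid is -1 when the conflicting lock is a flock(2)-style whole-file lock
 * rather than a POSIX record lock.
 */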
845
846
847 /*
848 * lf_getblock
849 *
850 * Description: Walk the list of locks for an inode and return the first
851 * blocking lock. A lock is considered blocking if we are not
852 * the lock owner; otherwise, we are permitted to upgrade or
853 * downgrade it, and it's not considered blocking.
854 *
855 * Parameters: lock The lock for which we are interested
856 * in obtaining the blocking lock, if any
857 *
858 * Returns: NOLOCKF No blocking lock exists
859 * !NOLOCKF The address of the blocking lock's
860 * struct lockf.
861 */
862 static struct lockf *
863 lf_getblock(struct lockf *lock)
864 {
865 struct lockf **prev, *overlap, *lf = *(lock->lf_head);
866 int ovcase;
867
868 prev = lock->lf_head;
869 while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) != OVERLAP_NONE) {
870 /*
871 * We've found an overlap, see if it blocks us
872 */
873 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
874 return (overlap);
875 /*
876 * Nope, point to the next one on the list and
877 * see if it blocks us
878 */
879 lf = overlap->lf_next;
880 }
881 return (NOLOCKF);
882 }
883
884
885 /*
886 * lf_findoverlap
887 *
888 * Description: Walk the list of locks to find an overlapping lock (if any).
889 *
890 * Parameters: lf First lock on lock list
891 * lock The lock we are checking for an overlap
892 * type Check type (SELF or OTHERS)
893 * prev pointer to a pointer to receive the address
894 * of the pointer to the lock preceding the
895 * overlapping lock, if an overlap is found
896 * overlap pointer to a pointer to receive the address
897 * of the overlapping lock, if one is found
898 *
899 * Returns: OVERLAP_NONE
900 * OVERLAP_EQUALS_LOCK
901 * OVERLAP_CONTAINS_LOCK
902 * OVERLAP_CONTAINED_BY_LOCK
903 * OVERLAP_STARTS_BEFORE_LOCK
904 * OVERLAP_ENDS_AFTER_LOCK
905 *
906 * Implicit Returns:
907 * *prev The address of the next pointer in the
908 * lock previous to the overlapping lock;
909 * this is generally used to relink the
910 * lock list, avoiding a second iteration.
911 * *overlap The pointer to the overlapping lock
912 * itself; this is used to return data in
913 * the check == OTHERS case, and for the
914 * caller to modify the overlapping lock,
915 * in the check == SELF case
916 *
917 * Note: This returns only the FIRST overlapping lock. There may be
918 * more than one. lf_getlock will return the first blocking lock,
919 * while lf_setlock will iterate over all overlapping locks in turn.
920 *
921 * The type parameter can be SELF, meaning we are looking for
922 * overlapping locks owned by us, or it can be OTHERS, meaning
923 * we are looking for overlapping locks owned by someone else so
924 * we can report a blocking lock on an F_GETLK request.
925 *
926 * The values of *overlap and *prev are modified, even if there is
927 * no overlapping lock found; always check the return code.
928 */
929 static overlap_t
930 lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
931 struct lockf ***prev, struct lockf **overlap)
932 {
933 off_t start, end;
934
935 *overlap = lf;
936 if (lf == NOLOCKF)
937 return (OVERLAP_NONE);
938 #ifdef LOCKF_DEBUGGING
939 if (lockf_debug & 2)
940 lf_print("lf_findoverlap: looking for overlap in", lock);
941 #endif /* LOCKF_DEBUGGING */
942 start = lock->lf_start;
943 end = lock->lf_end;
944 while (lf != NOLOCKF) {
945 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
946 ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
947 *prev = &lf->lf_next;
948 *overlap = lf = lf->lf_next;
949 continue;
950 }
951 #ifdef LOCKF_DEBUGGING
952 if (lockf_debug & 2)
953 lf_print("\tchecking", lf);
954 #endif /* LOCKF_DEBUGGING */
955 /*
956 * OK, check for overlap
957 */
958 if ((lf->lf_end != -1 && start > lf->lf_end) ||
959 (end != -1 && lf->lf_start > end)) {
960 /* Case 0 */
961 LOCKF_DEBUG(2, "no overlap\n");
962 if ((type & SELF) && end != -1 && lf->lf_start > end)
963 return (OVERLAP_NONE);
964 *prev = &lf->lf_next;
965 *overlap = lf = lf->lf_next;
966 continue;
967 }
968 if ((lf->lf_start == start) && (lf->lf_end == end)) {
969 LOCKF_DEBUG(2, "overlap == lock\n");
970 return (OVERLAP_EQUALS_LOCK);
971 }
972 if ((lf->lf_start <= start) &&
973 (end != -1) &&
974 ((lf->lf_end >= end) || (lf->lf_end == -1))) {
975 LOCKF_DEBUG(2, "overlap contains lock\n");
976 return (OVERLAP_CONTAINS_LOCK);
977 }
978 if (start <= lf->lf_start &&
979 (end == -1 ||
980 (lf->lf_end != -1 && end >= lf->lf_end))) {
981 LOCKF_DEBUG(2, "lock contains overlap\n");
982 return (OVERLAP_CONTAINED_BY_LOCK);
983 }
984 if ((lf->lf_start < start) &&
985 ((lf->lf_end >= start) || (lf->lf_end == -1))) {
986 LOCKF_DEBUG(2, "overlap starts before lock\n");
987 return (OVERLAP_STARTS_BEFORE_LOCK);
988 }
989 if ((lf->lf_start > start) &&
990 (end != -1) &&
991 ((lf->lf_end > end) || (lf->lf_end == -1))) {
992 LOCKF_DEBUG(2, "overlap ends after lock\n");
993 return (OVERLAP_ENDS_AFTER_LOCK);
994 }
995 panic("lf_findoverlap: default");
996 }
997 return (OVERLAP_NONE);
998 }
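
/*
 * The six return values, using illustrative byte ranges with the lock being
 * checked covering [40..59]:
 *
 *	OVERLAP_NONE			lf covers [0..9] or [80..99]
 *	OVERLAP_EQUALS_LOCK		lf covers [40..59]
 *	OVERLAP_CONTAINS_LOCK		lf covers [30..70]
 *	OVERLAP_CONTAINED_BY_LOCK	lf covers [45..50]
 *	OVERLAP_STARTS_BEFORE_LOCK	lf covers [30..50]
 *	OVERLAP_ENDS_AFTER_LOCK		lf covers [50..70]
 */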
999
1000
1001 /*
1002 * lf_split
1003 *
1004 * Description: Split a lock and a contained region into two or three locks
1005 * as necessary.
1006 *
1007 * Parameters: lock1 Lock to split
1008 * lock2 Overlapping lock region requiring the
1009 * split (upgrade/downgrade/unlock)
1010 *
1011 * Returns: 0 Success
1012 * ENOLCK No memory for new lock
1013 *
1014 * Implicit Returns:
1015 * *lock1 Modified original lock
1016 * *lock2 Overlapping lock (inserted into list)
1017 * (new lock) Potential new lock inserted into list
1018 * if split results in 3 locks
1019 *
1020 * Notes: This operation can only fail if the split would result in three
1021 * locks, and there is insufficient memory to allocate the third
1022 * lock; in that case, neither of the locks will be modified.
1023 */
1024 static int
1025 lf_split(struct lockf *lock1, struct lockf *lock2)
1026 {
1027 struct lockf *splitlock;
1028
1029 #ifdef LOCKF_DEBUGGING
1030 if (lockf_debug & 2) {
1031 lf_print("lf_split", lock1);
1032 lf_print("splitting from", lock2);
1033 }
1034 #endif /* LOCKF_DEBUGGING */
1035 /*
1036 * Check to see if splitting into only two pieces.
1037 */
1038 if (lock1->lf_start == lock2->lf_start) {
1039 lock1->lf_start = lock2->lf_end + 1;
1040 lock2->lf_next = lock1;
1041 return (0);
1042 }
1043 if (lock1->lf_end == lock2->lf_end) {
1044 lock1->lf_end = lock2->lf_start - 1;
1045 lock2->lf_next = lock1->lf_next;
1046 lock1->lf_next = lock2;
1047 return (0);
1048 }
1049 /*
1050 * Make a new lock consisting of the last part of
1051 * the encompassing lock
1052 */
1053 MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
1054 if (splitlock == NULL)
1055 return (ENOLCK);
1056 bcopy(lock1, splitlock, sizeof *splitlock);
1057 splitlock->lf_start = lock2->lf_end + 1;
1058 TAILQ_INIT(&splitlock->lf_blkhd);
1059 lock1->lf_end = lock2->lf_start - 1;
1060 /*
1061 * OK, now link it in
1062 */
1063 splitlock->lf_next = lock1->lf_next;
1064 lock2->lf_next = splitlock;
1065 lock1->lf_next = lock2;
1066
1067 return (0);
1068 }
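
/*
 * For example (illustrative ranges): splitting lock1 covering [0..199]
 * around lock2 covering [50..99] leaves lock1 as [0..49], links lock2 in
 * after it, and allocates a new lock for [100..199]; when lock2 shares a
 * start or an end with lock1, only two pieces result, no allocation is
 * needed, and the call cannot fail.
 */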
1069
1070
1071 /*
1072 * lf_wakelock
1073 *
1074 * Wakeup a blocklist in the case of a downgrade or unlock, since others
1075 * waiting on the lock may now be able to acquire it.
1076 *
1077 * Parameters: listhead Lock list head on which waiters may
1078 * have pending locks
1079 *
1080 * Returns: <void>
1081 *
1082 * Notes: This function iterates a list of locks and wakes all waiters,
1083 * rather than only waiters for the contended regions. Because
1084 * of this, for heavily contended files, this can result in a
1085 * "thundering herd" situation. Refactoring the code could make
1086 * this operation more efficient, if heavy contention ever results
1087 * in a real-world performance problem.
1088 */
1089 static void
1090 lf_wakelock(struct lockf *listhead, boolean_t force_all)
1091 {
1092 struct lockf *wakelock;
1093 boolean_t wake_all = TRUE;
1094
1095 if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE) && listhead->lf_waiters > SAFE_WAITER_LIMIT)
1096 wake_all = FALSE;
1097
1098 while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1099 wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
1100 TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
1101 listhead->lf_waiters--;
1102
1103 wakelock->lf_next = NOLOCKF;
1104 #ifdef LOCKF_DEBUGGING
1105 if (lockf_debug & 2)
1106 lf_print("lf_wakelock: awakening", wakelock);
1107 #endif /* LOCKF_DEBUGGING */
1108 if (wake_all == FALSE) {
1109
1110 TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
1111 wakelock->lf_waiters = listhead->lf_waiters;
1112 listhead->lf_waiters = 0;
1113
1114 if (!TAILQ_EMPTY(&wakelock->lf_blkhd)) {
1115 struct lockf *tlock;
1116
1117 TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
1118 tlock->lf_next = wakelock;
1119 }
1120 }
1121 }
1122 wakeup(wakelock);
1123
1124 if (wake_all == FALSE)
1125 break;
1126 }
1127 }
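
/*
 * For example (hypothetical counts): with F_WAKE1_SAFE set and more than
 * SAFE_WAITER_LIMIT waiters queued on a flock(2)-style lock being released,
 * only the first waiter is woken; the remaining waiters are moved onto that
 * waiter's block list, so they can be woken in turn later rather than all
 * stampeding at once.
 */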
1128
1129
1130 #ifdef LOCKF_DEBUGGING
1131 /*
1132 * lf_print DEBUG
1133 *
1134 * Print out a lock; lock information is prefixed by the string in 'tag'
1135 *
1136 * Parameters: tag A string tag for debugging
1137 * lock The lock whose information should be
1138 * displayed
1139 *
1140 * Returns: <void>
1141 */
1142 void
1143 lf_print(const char *tag, struct lockf *lock)
1144 {
1145 printf("%s: lock %p for ", tag, (void *)lock);
1146 if (lock->lf_flags & F_POSIX)
1147 printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid);
1148 else
1149 printf("id %p", (void *)lock->lf_id);
1150 if (lock->lf_vnode != 0)
1151 printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
1152 lock->lf_vnode,
1153 lock->lf_type == F_RDLCK ? "shared" :
1154 lock->lf_type == F_WRLCK ? "exclusive" :
1155 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1156 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1157 else
1158 printf(" %s, start 0x%016llx, end 0x%016llx",
1159 lock->lf_type == F_RDLCK ? "shared" :
1160 lock->lf_type == F_WRLCK ? "exclusive" :
1161 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1162 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1163 if (!TAILQ_EMPTY(&lock->lf_blkhd))
1164 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
1165 else
1166 printf("\n");
1167 }
1168
1169
1170 /*
1171 * lf_printlist DEBUG
1172 *
1173 * Print out a lock list for the vnode associated with 'lock'; lock information
1174 * is prefixed by the string in 'tag'
1175 *
1176 * Parameters: tag A string tag for debugging
1177 * lock The lock whose vnode's lock list should
1178 * be displayed
1179 *
1180 * Returns: <void>
1181 */
1182 void
1183 lf_printlist(const char *tag, struct lockf *lock)
1184 {
1185 struct lockf *lf, *blk;
1186
1187 if (lock->lf_vnode == 0)
1188 return;
1189
1190 printf("%s: Lock list for vno %p:\n",
1191 tag, lock->lf_vnode);
1192 for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
1193 printf("\tlock %p for ",(void *)lf);
1194 if (lf->lf_flags & F_POSIX)
1195 printf("proc %ld",
1196 (long)((struct proc *)lf->lf_id)->p_pid);
1197 else
1198 printf("id %p", (void *)lf->lf_id);
1199 printf(", %s, start 0x%016llx, end 0x%016llx",
1200 lf->lf_type == F_RDLCK ? "shared" :
1201 lf->lf_type == F_WRLCK ? "exclusive" :
1202 lf->lf_type == F_UNLCK ? "unlock" :
1203 "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
1204 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
1205 printf("\n\t\tlock request %p for ", (void *)blk);
1206 if (blk->lf_flags & F_POSIX)
1207 printf("proc %ld",
1208 (long)((struct proc *)blk->lf_id)->p_pid);
1209 else
1210 printf("id %p", (void *)blk->lf_id);
1211 printf(", %s, start 0x%016llx, end 0x%016llx",
1212 blk->lf_type == F_RDLCK ? "shared" :
1213 blk->lf_type == F_WRLCK ? "exclusive" :
1214 blk->lf_type == F_UNLCK ? "unlock" :
1215 "unknown", (intmax_t)blk->lf_start,
1216 (intmax_t)blk->lf_end);
1217 if (!TAILQ_EMPTY(&blk->lf_blkhd))
1218 panic("lf_printlist: bad list");
1219 }
1220 printf("\n");
1221 }
1222 }
1223 #endif /* LOCKF_DEBUGGING */